diff --git a/.bazelrc b/.bazelrc
index 396b84f70b3..03208385283 100644
--- a/.bazelrc
+++ b/.bazelrc
@@ -49,7 +49,6 @@
 #     rocm:         Build with AMD GPU support (rocm).
 #     mkl:          Enable full mkl support.
 #     tensorrt:     Enable Tensorrt support.
-#     ngraph:       Enable ngraph support.
 #     numa:         Enable numa using hwloc.
 #     noaws:        Disable AWS S3 storage support
 #     nogcp:        Disable GCS support.
@@ -159,6 +158,7 @@ build --host_java_toolchain=//third_party/toolchains/java:tf_java_toolchain
 # environment variable "TF_MKL_ROOT" every time before build.
 build:mkl --define=build_with_mkl=true --define=enable_mkl=true
 build:mkl --define=tensorflow_mkldnn_contraction_kernel=0
+build:mkl --define=build_with_openmp=true
 build:mkl -c opt
 
 # config to build OneDNN backend with a user specified threadpool.
@@ -172,6 +172,7 @@ build:mkl_threadpool -c opt
 build:mkl_opensource_only --define=build_with_mkl=true --define=enable_mkl=true
 build:mkl_opensource_only --define=tensorflow_mkldnn_contraction_kernel=0
 build:mkl_opensource_only --define=build_with_mkl_opensource=true
+build:mkl_opensource_only --define=build_with_openmp=true
 build:mkl_opensource_only -c opt
 
 # Config setting to build with oneDNN for Arm.
@@ -218,7 +219,6 @@ build:rocm --define=using_rocm=true --define=using_rocm_hipcc=true
 build:rocm --action_env TF_NEED_ROCM=1
 
 # Options extracted from configure script
-build:ngraph --define=with_ngraph_support=true
 build:numa --define=with_numa_support=true
 
 # Options to disable default on features
@@ -283,7 +283,7 @@ build:ios --copt=-w
 build:linux --copt=-w
 build:linux --host_copt=-w
 build:macos --copt=-w
-build:windows --copt=/w
+build:windows --copt=/W0
 
 # Tensorflow uses M_* math constants that only get defined by MSVC headers if
 # _USE_MATH_DEFINES is defined.
@@ -294,9 +294,11 @@ build:windows --host_copt=/D_USE_MATH_DEFINES
 build:linux --define=PREFIX=/usr
 build:linux --define=LIBDIR=$(PREFIX)/lib
 build:linux --define=INCLUDEDIR=$(PREFIX)/include
+build:linux --define=PROTOBUF_INCLUDE_PATH=$(PREFIX)/include
 build:macos --define=PREFIX=/usr
 build:macos --define=LIBDIR=$(PREFIX)/lib
 build:macos --define=INCLUDEDIR=$(PREFIX)/include
+build:macos --define=PROTOBUF_INCLUDE_PATH=$(PREFIX)/include
 # TF_SYSTEM_LIBS do not work on windows.
 
 # By default, build TF in C++ 14 mode.
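The `PROTOBUF_INCLUDE_PATH` defines added above are only written when configure.py sees the matching environment variables (its hunk appears later in this diff). A self-contained sketch of that emission logic; `write_to_bazelrc` and the output path here are stand-ins for configure.py internals:

```python
import os

def write_to_bazelrc(line, path=".tf_configure.bazelrc"):
    # Append one generated line to the bazelrc fragment configure.py writes.
    with open(path, "a") as f:
        f.write(line + "\n")

environ_cp = dict(os.environ)
# PREFIX, LIBDIR, INCLUDEDIR, and the newly supported PROTOBUF_INCLUDE_PATH
# are forwarded as --define flags only when set in the environment.
for varname in ("PREFIX", "LIBDIR", "INCLUDEDIR", "PROTOBUF_INCLUDE_PATH"):
    if varname in environ_cp:
        write_to_bazelrc("build --define=%s=%s" % (varname, environ_cp[varname]))
```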
diff --git a/README.md b/README.md
index 31888cfbbc6..116477670f6 100644
--- a/README.md
+++ b/README.md
@@ -103,23 +103,22 @@ open-source software development:
 
 ### Official Builds
 
-Build Type | Status | Artifacts
------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ---------
-**Linux CPU** | [![Status](https://storage.googleapis.com/tensorflow-kokoro-build-badges/ubuntu-cc.svg)](https://storage.googleapis.com/tensorflow-kokoro-build-badges/ubuntu-cc.html) | [PyPI](https://pypi.org/project/tf-nightly/)
-**Linux GPU** | [![Status](https://storage.googleapis.com/tensorflow-kokoro-build-badges/ubuntu-gpu-py3.svg)](https://storage.googleapis.com/tensorflow-kokoro-build-badges/ubuntu-gpu-py3.html) | [PyPI](https://pypi.org/project/tf-nightly-gpu/)
-**Linux XLA** | [![Status](https://storage.googleapis.com/tensorflow-kokoro-build-badges/ubuntu-xla.svg)](https://storage.googleapis.com/tensorflow-kokoro-build-badges/ubuntu-xla.html) | TBA
-**macOS** | [![Status](https://storage.googleapis.com/tensorflow-kokoro-build-badges/macos-py2-cc.svg)](https://storage.googleapis.com/tensorflow-kokoro-build-badges/macos-py2-cc.html) | [PyPI](https://pypi.org/project/tf-nightly/)
-**Windows CPU** | [![Status](https://storage.googleapis.com/tensorflow-kokoro-build-badges/windows-cpu.svg)](https://storage.googleapis.com/tensorflow-kokoro-build-badges/windows-cpu.html) | [PyPI](https://pypi.org/project/tf-nightly/)
-**Windows GPU** | [![Status](https://storage.googleapis.com/tensorflow-kokoro-build-badges/windows-gpu.svg)](https://storage.googleapis.com/tensorflow-kokoro-build-badges/windows-gpu.html) | [PyPI](https://pypi.org/project/tf-nightly-gpu/)
-**Android** | [![Status](https://storage.googleapis.com/tensorflow-kokoro-build-badges/android.svg)](https://storage.googleapis.com/tensorflow-kokoro-build-badges/android.html) | [![Download](https://api.bintray.com/packages/google/tensorflow/tensorflow/images/download.svg)](https://bintray.com/google/tensorflow/tensorflow/_latestVersion)
-**Raspberry Pi 0 and 1** | [![Status](https://storage.googleapis.com/tensorflow-kokoro-build-badges/rpi01-py3.svg)](https://storage.googleapis.com/tensorflow-kokoro-build-badges/rpi01-py3.html) | [Py3](https://storage.googleapis.com/tensorflow-nightly/tensorflow-1.10.0-cp34-none-linux_armv6l.whl)
-**Raspberry Pi 2 and 3** | [![Status](https://storage.googleapis.com/tensorflow-kokoro-build-badges/rpi23-py3.svg)](https://storage.googleapis.com/tensorflow-kokoro-build-badges/rpi23-py3.html) | [Py3](https://storage.googleapis.com/tensorflow-nightly/tensorflow-1.10.0-cp34-none-linux_armv7l.whl)
-**Libtensorflow MacOS CPU** | [![Status](https://storage.googleapis.com/tensorflow-kokoro-build-badges/libtensorflow-mac-cpu.svg)](https://storage.googleapis.com/tensorflow-kokoro-build-badges/libtensorflow-mac-cpu.html) | [GCS](https://storage.googleapis.com/libtensorflow-nightly)
-**Libtensorflow Linux CPU** | [![Status](https://storage.googleapis.com/tensorflow-kokoro-build-badges/libtensorflow-linux-cpu.svg)](https://storage.googleapis.com/tensorflow-kokoro-build-badges/libtensorflow-linux-cpu.html) | [GCS](https://storage.googleapis.com/libtensorflow-nightly)
-**Libtensorflow Linux GPU** | [![Status](https://storage.googleapis.com/tensorflow-kokoro-build-badges/libtensorflow-linux-gpu.svg)](https://storage.googleapis.com/tensorflow-kokoro-build-badges/libtensorflow-linux-gpu.html) | [GCS](https://storage.googleapis.com/libtensorflow-nightly)
-**Libtensorflow Windows CPU** | [![Status](https://storage.googleapis.com/tensorflow-kokoro-build-badges/libtensorflow-win-cpu.svg)](https://storage.googleapis.com/tensorflow-kokoro-build-badges/libtensorflow-win-cpu.html) | [GCS](https://storage.googleapis.com/libtensorflow-nightly)
-**Libtensorflow Windows GPU** | [![Status](https://storage.googleapis.com/tensorflow-kokoro-build-badges/libtensorflow-win-gpu.svg)](https://storage.googleapis.com/tensorflow-kokoro-build-badges/libtensorflow-win-gpu.html) | [GCS](https://storage.googleapis.com/libtensorflow-nightly)
-
+Build Type | Status | Artifacts
+----------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | ---------
+**Linux CPU** | [![Status](https://storage.googleapis.com/tensorflow-kokoro-build-badges/ubuntu-cc.svg)](https://storage.googleapis.com/tensorflow-kokoro-build-badges/ubuntu-cc.html) | [PyPI](https://pypi.org/project/tf-nightly/)
+**Linux GPU** | [![Status](https://storage.googleapis.com/tensorflow-kokoro-build-badges/ubuntu-gpu-py3.svg)](https://storage.googleapis.com/tensorflow-kokoro-build-badges/ubuntu-gpu-py3.html) | [PyPI](https://pypi.org/project/tf-nightly-gpu/)
+**Linux XLA** | [![Status](https://storage.googleapis.com/tensorflow-kokoro-build-badges/ubuntu-xla.svg)](https://storage.googleapis.com/tensorflow-kokoro-build-badges/ubuntu-xla.html) | TBA
+**macOS** | [![Status](https://storage.googleapis.com/tensorflow-kokoro-build-badges/macos-py2-cc.svg)](https://storage.googleapis.com/tensorflow-kokoro-build-badges/macos-py2-cc.html) | [PyPI](https://pypi.org/project/tf-nightly/)
+**Windows CPU** | [![Status](https://storage.googleapis.com/tensorflow-kokoro-build-badges/windows-cpu.svg)](https://storage.googleapis.com/tensorflow-kokoro-build-badges/windows-cpu.html) | [PyPI](https://pypi.org/project/tf-nightly/)
+**Windows GPU** | [![Status](https://storage.googleapis.com/tensorflow-kokoro-build-badges/windows-gpu.svg)](https://storage.googleapis.com/tensorflow-kokoro-build-badges/windows-gpu.html) | [PyPI](https://pypi.org/project/tf-nightly-gpu/)
+**Android** | [![Status](https://storage.googleapis.com/tensorflow-kokoro-build-badges/android.svg)](https://storage.googleapis.com/tensorflow-kokoro-build-badges/android.html) | [![Download](https://api.bintray.com/packages/google/tensorflow/tensorflow/images/download.svg)](https://bintray.com/google/tensorflow/tensorflow/_latestVersion)
+**Raspberry Pi 0 and 1** | [![Status](https://storage.googleapis.com/tensorflow-kokoro-build-badges/rpi01-py3.svg)](https://storage.googleapis.com/tensorflow-kokoro-build-badges/rpi01-py3.html) | [Py3](https://storage.googleapis.com/tensorflow-nightly/tensorflow-1.10.0-cp34-none-linux_armv6l.whl)
+**Raspberry Pi 2 and 3** | [![Status](https://storage.googleapis.com/tensorflow-kokoro-build-badges/rpi23-py3.svg)](https://storage.googleapis.com/tensorflow-kokoro-build-badges/rpi23-py3.html) | [Py3](https://storage.googleapis.com/tensorflow-nightly/tensorflow-1.10.0-cp34-none-linux_armv7l.whl)
+**Libtensorflow MacOS CPU** | [![Status](https://storage.googleapis.com/tensorflow-kokoro-build-badges/libtensorflow-mac-cpu.svg)](https://storage.googleapis.com/tensorflow-kokoro-build-badges/libtensorflow-mac-cpu.html) | [Nightly GCS](https://storage.googleapis.com/libtensorflow-nightly) [Official GCS](https://storage.googleapis.com/tensorflow/)
+**Libtensorflow Linux CPU** | [![Status](https://storage.googleapis.com/tensorflow-kokoro-build-badges/libtensorflow-linux-cpu.svg)](https://storage.googleapis.com/tensorflow-kokoro-build-badges/libtensorflow-linux-cpu.html) | [Nightly GCS](https://storage.googleapis.com/libtensorflow-nightly) [Official GCS](https://storage.googleapis.com/tensorflow/)
+**Libtensorflow Linux GPU** | [![Status](https://storage.googleapis.com/tensorflow-kokoro-build-badges/libtensorflow-linux-gpu.svg)](https://storage.googleapis.com/tensorflow-kokoro-build-badges/libtensorflow-linux-gpu.html) | [Nightly GCS](https://storage.googleapis.com/libtensorflow-nightly) [Official GCS](https://storage.googleapis.com/tensorflow/)
+**Libtensorflow Windows CPU** | [![Status](https://storage.googleapis.com/tensorflow-kokoro-build-badges/libtensorflow-win-cpu.svg)](https://storage.googleapis.com/tensorflow-kokoro-build-badges/libtensorflow-win-cpu.html) | [Nightly GCS](https://storage.googleapis.com/libtensorflow-nightly) [Official GCS](https://storage.googleapis.com/tensorflow/)
+**Libtensorflow Windows GPU** | [![Status](https://storage.googleapis.com/tensorflow-kokoro-build-badges/libtensorflow-win-gpu.svg)](https://storage.googleapis.com/tensorflow-kokoro-build-badges/libtensorflow-win-gpu.html) | [Nightly GCS](https://storage.googleapis.com/libtensorflow-nightly) [Official GCS](https://storage.googleapis.com/tensorflow/)
 
 ### Community Supported Builds
 
@@ -133,12 +132,20 @@ Build Type
 **Linux ppc64le CPU** Stable Release | [![Build Status](https://powerci.osuosl.org/job/TensorFlow_PPC64LE_CPU_Release_Build/badge/icon)](https://powerci.osuosl.org/job/TensorFlow_PPC64LE_CPU_Release_Build/) | Release [1.15](https://powerci.osuosl.org/job/TensorFlow_PPC64LE_CPU_Release_Build/) / [2.x](https://powerci.osuosl.org/job/TensorFlow2_PPC64LE_CPU_Release_Build/)
 **Linux ppc64le GPU** Nightly | [![Build Status](https://powerci.osuosl.org/job/TensorFlow_PPC64LE_GPU_Build/badge/icon)](https://powerci.osuosl.org/job/TensorFlow_PPC64LE_GPU_Build/) | [Nightly](https://powerci.osuosl.org/job/TensorFlow_PPC64LE_GPU_Nightly_Artifact/)
 **Linux ppc64le GPU** Stable Release | [![Build Status](https://powerci.osuosl.org/job/TensorFlow_PPC64LE_GPU_Release_Build/badge/icon)](https://powerci.osuosl.org/job/TensorFlow_PPC64LE_GPU_Release_Build/) | Release [1.15](https://powerci.osuosl.org/job/TensorFlow_PPC64LE_GPU_Release_Build/) / [2.x](https://powerci.osuosl.org/job/TensorFlow2_PPC64LE_GPU_Release_Build/)
-**Linux aarch64 CPU** Nightly<br> Python 3.6 | [![Build Status](http://openlabtesting.org:15000/badge?project=tensorflow%2Ftensorflow)](https://status.openlabtesting.org/builds/builds?project=tensorflow%2Ftensorflow&job_name=tensorflow-arm64-build-daily-master) | [Nightly](https://status.openlabtesting.org/builds/builds?project=tensorflow%2Ftensorflow&job_name=tensorflow-arm64-build-daily-master)
-**Linux aarch64 CPU** Stable Release | [![Build Status](http://openlabtesting.org:15000/badge?project=tensorflow%2Ftensorflow&job_name=tensorflow-v1.15.3-cpu-arm64-release-build-show&job_name=tensorflow-v2.1.0-cpu-arm64-release-build-show)](http://status.openlabtesting.org/builds?project=tensorflow%2Ftensorflow&job_name=tensorflow-v2.1.0-cpu-arm64-release-build-show&job_name=tensorflow-v1.15.3-cpu-arm64-release-build-show) | Release [1.15](http://status.openlabtesting.org/builds?project=tensorflow%2Ftensorflow&job_name=tensorflow-v1.15.3-cpu-arm64-release-build-show) / [2.x](http://status.openlabtesting.org/builds?project=tensorflow%2Ftensorflow&job_name=tensorflow-v2.1.0-cpu-arm64-release-build-show)
+**Linux aarch64 CPU** Nightly (Linaro)<br> Python 3.8 | [![Build Status](https://ci.linaro.org/jenkins/buildStatus/icon?job=ldcg-hpc-tensorflow)](https://ci.linaro.org/jenkins/job/ldcg-hpc-tensorflow/) | [Nightly](http://snapshots.linaro.org/hpc/python/tensorflow/latest/)
+**Linux aarch64 CPU** Stable Release (Linaro) | [![Build Status](https://ci.linaro.org/jenkins/buildStatus/icon?job=ldcg-hpc-tensorflow)](https://ci.linaro.org/jenkins/job/ldcg-hpc-tensorflow/) | Release [1.x & 2.x](http://snapshots.linaro.org/hpc/python/tensorflow/latest/)
+**Linux aarch64 CPU** Nightly (OpenLab)<br> Python 3.6 | [![Build Status](http://openlabtesting.org:15000/badge?project=tensorflow%2Ftensorflow)](https://status.openlabtesting.org/builds/builds?project=tensorflow%2Ftensorflow&job_name=tensorflow-arm64-build-daily-master) | [Nightly](https://status.openlabtesting.org/builds/builds?project=tensorflow%2Ftensorflow&job_name=tensorflow-arm64-build-daily-master)
+**Linux aarch64 CPU** Stable Release (OpenLab) | [![Build Status](http://openlabtesting.org:15000/badge?project=tensorflow%2Ftensorflow&job_name=tensorflow-v1.15.3-cpu-arm64-release-build-show&job_name=tensorflow-v2.1.0-cpu-arm64-release-build-show)](http://status.openlabtesting.org/builds?project=tensorflow%2Ftensorflow&job_name=tensorflow-v2.1.0-cpu-arm64-release-build-show&job_name=tensorflow-v1.15.3-cpu-arm64-release-build-show) | Release [1.15](http://status.openlabtesting.org/builds?project=tensorflow%2Ftensorflow&job_name=tensorflow-v1.15.3-cpu-arm64-release-build-show) / [2.x](http://status.openlabtesting.org/builds?project=tensorflow%2Ftensorflow&job_name=tensorflow-v2.1.0-cpu-arm64-release-build-show)
 **Linux CPU with Intel oneAPI Deep Neural Network Library (oneDNN)** Nightly | [![Build Status](https://tensorflow-ci.intel.com/job/tensorflow-mkl-build-whl-nightly/badge/icon)](https://tensorflow-ci.intel.com/job/tensorflow-mkl-build-whl-nightly/) | [Nightly](https://tensorflow-ci.intel.com/job/tensorflow-mkl-build-whl-nightly/)
 **Linux CPU with Intel oneAPI Deep Neural Network Library (oneDNN)** Stable Release | ![Build Status](https://tensorflow-ci.intel.com/job/tensorflow-mkl-build-release-whl/badge/icon) | Release [1.15](https://pypi.org/project/intel-tensorflow/1.15.0/) / [2.x](https://pypi.org/project/intel-tensorflow/)
 **Red Hat® Enterprise Linux® 7.6 CPU & GPU**<br> Python 2.7, 3.6 | [![Build Status](https://jenkins-tensorflow.apps.ci.centos.org/buildStatus/icon?job=tensorflow-rhel7-3.6&build=2)](https://jenkins-tensorflow.apps.ci.centos.org/job/tensorflow-rhel7-3.6/2/) | [1.13.1 PyPI](https://tensorflow.pypi.thoth-station.ninja/index/)
 
+### Community Supported Containers
+
+Container Type | Status | Artifacts
+----------------------------------------------------------------- | ------ | ---------
+**TensorFlow aarch64 Neoverse-N1 CPU** Stable (Linaro)<br> Debian | Static | Release [2.3](https://hub.docker.com/r/linaro/tensorflow-arm-neoverse-n1)
+
 ## Resources
 
 *   [TensorFlow.org](https://www.tensorflow.org)
@@ -151,6 +158,7 @@ Build Type
 *   [Intro to TensorFlow for Deep Learning from Udacity](https://www.udacity.com/course/intro-to-tensorflow-for-deep-learning--ud187)
 *   [Introduction to TensorFlow Lite from Udacity](https://www.udacity.com/course/intro-to-tensorflow-lite--ud190)
 *   [Machine Learning with TensorFlow on GCP](https://www.coursera.org/specializations/machine-learning-tensorflow-gcp)
+*   [TensorFlow Codelabs](https://codelabs.developers.google.com/?cat=TensorFlow)
 *   [TensorFlow Chat Room on StackOverflow (not actively monitored by the TensorFlow team)](https://chat.stackoverflow.com/rooms/216694/tensorflow)
 *   [TensorFlow Blog](https://blog.tensorflow.org)
diff --git a/RELEASE.md b/RELEASE.md
index 18649653304..962cc87ae28 100644
--- a/RELEASE.md
+++ b/RELEASE.md
@@ -1,3 +1,58 @@
+# Release 2.5.0
+
+<INSERT SMALL BLURB ABOUT RELEASE FOCUS AREA AND POTENTIAL TOOLCHAIN CHANGES>
+
+## Breaking Changes
+
+*
+*
+
+## Known Caveats
+
+*
+*
+*
+
+## Major Features and Improvements
+
+*
+*
+
+* TPU embedding support
+    * Added `profile_data_directory` to `EmbeddingConfigSpec` in
+      `_tpu_estimator_embedding.py`. This allows embedding lookup statistics
+      gathered at runtime to be used in embedding layer partitioning decisions.
+
+## Bug Fixes and Other Changes
+
+*
+*
+*
+* `tf.keras`:
+    * Improvements to Keras preprocessing layers:
+        * Discretization combiner implemented, with additional arg `epsilon`.
+
+* `tf.data`:
+    * Exposing `tf.data.experimental.ExternalStatePolicy`, which can be used
+      to control how external state should be handled during dataset
+      serialization or iterator checkpointing.
+
+* `tf.lite`:
+    * NNAPI
+        * Removed deprecated `Interpreter::UseNNAPI(bool)` C++ API.
+        * Use `NnApiDelegate()` and related delegate configuration methods
+          directly.
+* TF Core:
+    * Corrected higher-order gradients of control flow constructs (`tf.cond`,
+      `tf.while_loop`, and compositions like `tf.foldl`) computed with
+      `tf.GradientTape` inside a `tf.function`.
+
+## Thanks to our Contributors
+
+This release contains contributions from many people at Google, as well as:
+
+<INSERT>, <NAME>, <HERE>, <USING>, <GITHUB>, <HANDLE>
+
 # Release 2.4.0
 
@@ -6,6 +61,15 @@
 *
 *
+* Certain float32 ops run in lower precision on Ampere based GPUs, including
+  matmuls and convolutions, due to the use of
+  [TensorFloat-32](https://blogs.nvidia.com/blog/2020/05/14/tensorfloat-32-precision-format/).
+  Specifically, inputs to such ops are rounded from 23 bits of precision to 10
+  bits of precision. This is unlikely to cause issues in practice for deep
+  learning models. In some cases, TensorFloat-32 is also used for complex64 ops.
+  TensorFloat-32 can be disabled by running
+  `config.experimental.enable_tensor_float_32_execution(False)`. The "Major
+  Features and Improvements" section has more details.
 * The byte layout for string tensors across the C-API has been updated to match
   TF Core/C++; i.e., a contiguous array of `tensorflow::tstring`/`TF_TString`s.
 * C-API functions `TF_StringDecode`, `TF_StringEncode`, and
@@ -34,6 +98,7 @@
   shape assumptions (note that you can pass shapes with `None` entries for
   axes that are meant to be dynamic). You can also disable the input checking
   entirely by setting `model.input_spec = None`.
+* TF pip packages now use CUDA11 and cuDNN 8.0.2.
 * XLA:CPU and XLA:GPU devices are no longer registered by default. Use
   `TF_XLA_FLAGS=--tf_xla_enable_xla_devices` if you really need them (to be
   removed).
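The TensorFloat-32 entry above mentions the opt-out API by name; a minimal sketch of that toggle (TF 2.4 API names, per the note):

```python
import tensorflow as tf

# TensorFloat-32 is enabled by default on Ampere GPUs; float32 matmuls and
# convolutions then round inputs from 23 to 10 bits of mantissa precision.
tf.config.experimental.enable_tensor_float_32_execution(False)

# Verify the toggle took effect.
assert not tf.config.experimental.tensor_float_32_execution_enabled()
```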
@@ -46,6 +111,49 @@
 * `tf.data.experimental.service.WorkerServer` now takes a config tuple
   instead of individual arguments. Usages should be updated to
   `tf.data.experimental.service.WorkerServer(worker_config)`.
+* `tf.quantization.quantize_and_dequantize_v2` has been introduced, which
+  updates the gradient definition for quantization which is outside the range
+  to be 0. To simulate the V1 behavior of
+  `tf.quantization.quantize_and_dequantize(...)` use
+  `tf.grad_pass_through(tf.quantization.quantize_and_dequantize_v2)(...)`.
+* `tf.distribute.Strategy.experimental_make_numpy_dataset` is removed. Please
+  use `tf.data.Dataset.from_tensor_slices` instead.
+* `experimental_hints` in `tf.distribute.StrategyExtended.reduce_to`,
+  `tf.distribute.StrategyExtended.batch_reduce_to`,
+  `tf.distribute.ReplicaContext.all_reduce` are renamed to `options`.
+  `tf.distribute.experimental.CollectiveHints` is renamed
+  `tf.distribute.experimental.CommunicationOptions`.
+  `tf.distribute.experimental.CollectiveCommunication` is renamed
+  `tf.distribute.experimental.CommunicationImplementation`.
+* `tf.keras.mixed_precision.experimental`:
+    * `AutoCastVariable.dtype` now refers to the actual variable dtype, not
+      the dtype it will be casted to.
+    * When mixed precision is enabled, `tf.keras.layers.Embedding` now outputs
+      a float16 or bfloat16 tensor instead of a float32 tensor.
+    * The property
+      `tf.keras.mixed_precision.experimental.LossScaleOptimizer.loss_scale` is
+      now a tensor, not a `LossScale` object. This means to get a loss scale
+      of a `LossScaleOptimizer` as a tensor, you must now call
+      `opt.loss_scale` instead of `opt.loss_scale()`.
+    * The property `should_cast_variables` has been removed from
+      `tf.keras.mixed_precision.experimental.Policy`.
+    * When passing a `tf.mixed_precision.experimental.DynamicLossScale` to
+      `tf.keras.mixed_precision.experimental.LossScaleOptimizer`, the
+      `DynamicLossScale`'s multiplier must be 2.
+    * When passing a `tf.mixed_precision.experimental.DynamicLossScale` to
+      `tf.keras.mixed_precision.experimental.LossScaleOptimizer`, the weights
+      of the `DynamicLossScale` are copied into the `LossScaleOptimizer`
+      instead of being reused. This means modifying the weights of the
+      `DynamicLossScale` will no longer affect the weights of the
+      `LossScaleOptimizer`, and vice versa.
+    * The global policy can no longer be set to a non-floating point policy in
+      `tf.keras.mixed_precision.experimental.set_policy`.
+    * In `Layer.call`, `AutoCastVariable`s will no longer be casted within
+      `MirroredStrategy.run` or `ReplicaContext.merge_call`. This is because a
+      thread local variable is used to determine whether `AutoCastVariable`s
+      are casted, and those two functions run in different threads. Note this
+      only applies if one of these two functions is called within
+      `Layer.call`; if one of those two functions calls `Layer.call`,
+      `AutoCastVariable`s will still be casted.
 
 ## Known Caveats
 
@@ -57,9 +165,40 @@
 *
 * A new module named `tf.experimental.numpy` is added, which is a NumPy-compatible API for writing TF programs. This module provides class `ndarray`, which mimics the `ndarray` class in NumPy, and wraps an immutable `tf.Tensor` under the hood. A subset of NumPy functions (e.g. `numpy.add`) are provided. Their inter-operation with TF facilities is seamless in most cases.
  See [tensorflow/python/ops/numpy_ops/README.md](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/ops/numpy_ops/README.md) for details of what operations are supported and what are the differences from NumPy.
 * A major refactoring of the internals of the Keras Functional API has been completed, that should improve the reliability, stability, and performance of constructing Functional models.
+* Support for
+  [TensorFloat-32](https://blogs.nvidia.com/blog/2020/05/14/tensorfloat-32-precision-format/)
+  on Ampere based GPUs has been added. TensorFloat-32, or TF32 for short, is a
+  math mode for NVIDIA Ampere GPUs which causes certain float32 ops, such as
+  matrix multiplications and convolutions, to run much faster on Ampere GPUs
+  but with reduced precision. This reduced precision has not been found to
+  affect convergence quality of deep learning models in practice.
+  TensorFloat-32 is enabled by default, but can be disabled with
+  `tf.config.experimental.enable_tensor_float_32_execution`.
 * `tf.distribute`:
+    * `MultiWorkerMirroredStrategy` is graduated out of experimental.
+        * Peer failure will no longer cause the cluster to hang.
+        * Major issues with saving are fixed.
+        * See [Multi-worker training with Keras](https://www.tensorflow.org/tutorials/distribute/multi_worker_with_keras) for a tutorial.
     * Deprecated `experimental_distribute_datasets_from_function` method and renamed it to `distribute_datasets_from_function` as it is no longer experimental.
+* The `tf.keras.mixed_precision` API has been made non-experimental. The major
+  changes to the new non-experimental API are:
+    * `tf.keras.mixed_precision.Policy` no longer takes in a
+      `tf.mixed_precision.experimental.LossScale` in the constructor, and no
+      longer has a `LossScale` associated with it. Instead, `Model.compile`
+      will automatically wrap the optimizer with a `LossScaleOptimizer` using
+      dynamic loss scaling if `Policy.name` is "mixed_float16".
+    * `tf.keras.mixed_precision.LossScaleOptimizer`'s constructor takes in
+      different arguments. In particular, it no longer takes in a `LossScale`,
+      and there is no longer a `LossScale` associated with the
+      `LossScaleOptimizer`. Instead, `LossScaleOptimizer` directly implements
+      fixed or dynamic loss scaling. See the documentation of
+      `tf.keras.mixed_precision.experimental.LossScaleOptimizer` for details
+      on the differences between the experimental `LossScaleOptimizer` and the
+      new non-experimental `LossScaleOptimizer`.
+    * `tf.mixed_precision.experimental.LossScale` and its subclasses are
+      deprecated, as all of their functionality now exists within
+      `tf.keras.mixed_precision.LossScaleOptimizer`.
 
 ## Bug Fixes and Other Changes
 
@@ -109,6 +248,10 @@
     ([CVE-2020-15212](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-15212),
     [CVE-2020-15213](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-15213),
     [CVE-2020-15214](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-15214))
+    * Fixes a segfault in `tf.quantization.quantize_and_dequantize`
+      ([CVE-2020-15265](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-15265))
+    * Fixes an undefined behavior float cast causing a crash
+      ([CVE-2020-15266](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-15266))
 * TF Core:
     * `tf.types.experimental.TensorLike` is a new `Union` type that can be
       used as type annotation for variables representing a Tensor or a value
@@ -130,6 +273,8 @@
       stateful ops.
     * Added `tf.config.experimental.get_memory_usage` to return total memory
       usage of the device.
+    * Added gradients for `RaggedTensorToVariant` and `RaggedTensorFromVariant`.
+    * Improve shape inference of nested function calls by supporting constant
+      folding across Arg nodes which makes more static values available to
+      shape inference functions.
 * `tf.data`:
     * tf.data service:
         * Added new `tf.data.experimental.service.register_dataset` and
@@ -174,7 +319,16 @@
       how many times the function is called, and independent of global seed
       settings.
 * `tf.distribute`:
-    *
+    * (Experimental) Parameter server training:
+        * Replaced the existing
+          `tf.distribute.experimental.ParameterServerStrategy` symbol with
+          a new class that is for parameter server training in TF2. Usage with
+          the old symbol, usually with Estimator, should be replaced with
+          `tf.compat.v1.distribute.experimental.ParameterServerStrategy`.
+        * Added `tf.distribute.experimental.coordinator.*` namespace,
+          including the main API `ClusterCoordinator` for coordinating the
+          training cluster, the related data structure `RemoteValue`
+          and `PerWorkerValue`.
 * `tf.keras`:
     * Improvements from the functional API refactoring:
         * Functional model construction does not need to maintain a global
@@ -209,21 +363,37 @@
     * Improvements to Keras preprocessing layers:
         * TextVectorization can now accept a vocabulary list or file as an
           init arg.
+        * TextVectorization, StringLookup, and IntegerLookup can now accept a
+          vocabulary file via the `set_vocab_from_file` method.
+        * Normalization can now accept mean and variance values as init args.
     * In `Attention` and `AdditiveAttention` layers, the `call()` method now
       accepts a `return_attention_scores` argument. When set to True, the
       layer returns the attention scores as an additional output argument.
     * Added `tf.metrics.log_cosh` and `tf.metrics.logcosh` API entrypoints
       with the same implementation as their `tf.losses` equivalent.
+    * For Keras models, an individual call to `Model.evaluate` uses no cached
+      data for evaluation, while `Model.fit` uses cached data when the
+      `validation_data` arg is provided, for better performance.
+    * Added a `save_traces` argument to `model.save`/
+      `tf.keras.models.save_model` which determines whether the SavedModel
+      format stores the Keras model/layer call functions. The traced functions
+      allow Keras to revive custom models and layers without the original
+      class definition, but if this isn't required the tracing can be
+      disabled with the added option.
 * `tf.function` / AutoGraph:
     * Added `experimental_follow_type_hints` argument for `tf.function`. When
       True, the function may use type annotations to optimize the tracing
       performance.
     * Added support for `iter(DistributedDataset)` in AutoGraph `for` loops.
-    * AutoGraph now allows creating new symbols inside a TensorFLow loop, if
+    * AutoGraph now allows creating new symbols inside a TensorFlow loop, if
       the values of these symbols at an iteration do not depend on the
       previous iteration. These types of loops must run at least one
       iteration, and will raise a runtime error otherwise.
+    * Variables contained in `tf.Module`s that are set as attributes of
+      custom Keras `Layer`s and `Model`s are now tracked in
+      the properties `layer.trainable_variables` and
+      `layer.non_trainable_variables`.
 
   Example:
@@ -254,8 +424,13 @@
     * Deprecate `Interpreter::UseNNAPI(bool)` C++ API.
     * Use `NnApiDelegate()` and related delegate configuration methods
       directly.
+    * Deprecate `Interpreter::SetAllowFp16PrecisionForFp32(bool)` C++ API.
+    * Prefer controlling this via delegate options, e.g.
+      `tflite::StatefulNnApiDelegate::Options::allow_fp16` or
+      `TfLiteGpuDelegateOptionsV2::is_precision_loss_allowed`.
     * `DynamicBuffer::AddJoinedString()` will now add a separator if the first
       string to be joined is empty.
+    * Added support for cumulative sum (cumsum), both as builtin op and MLIR
+      conversion.
     *
 
 * `tf.random`:
 
@@ -264,7 +439,7 @@
 
 * Math and Linear Algebra:
 
-    *
+    * Add `tf.math.erfcinv`, the inverse to `tf.math.erfc`.
 
 * TPU Enhancements:
 
@@ -310,6 +485,12 @@
     didn't have the keys sorted, the keys and values were not being printed
     in accordance with their correct mapping.
 
+* `TensorRT`
+
+    * We now issue a warning when the `session_config` parameter for the TF1
+      converter is used or the `rewrite_config_template` field in the TF2
+      converter parameter object is used.
+
 * Other:
 
     * We have replaced uses of "whitelist" and "blacklist" with "allowlist"
@@ -318,6 +499,8 @@
       context.
     * Add `tf.config.experimental.mlir_bridge_rollout` which will help us
       rollout the new MLIR TPU bridge.
+    * Added `tf.experimental.register_filesystem_plugin` to load modular
+      filesystem plugins from Python.
     *
 
 ## Thanks to our Contributors
 
@@ -690,6 +873,7 @@ stjohnso98, , , , ,
 * Add `tf.saved_model.LoadOptions` with [`experimental_io_device`](https://www.tensorflow.org/versions/r2.3/api_docs/python/tf/saved_model/LoadOptions?hl=en) as arg with default value `None` to choose the I/O device for loading models and weights.
 * Update `tf.saved_model.SaveOptions` with [`experimental_io_device`](https://www.tensorflow.org/versions/r2.3/api_docs/python/tf/saved_model/SaveOptions?hl=en) as arg with default value `None` to choose the I/O device for saving models and weights.
 * Mutable tables now restore checkpointed values when loaded from SavedModel.
+* The user object metadata field in the SavedModel proto has been deprecated as part of the updates to Keras SavedModel. Keras was the only consumer of this field prior to the update.
 * GPU
     * TF 2.3 includes PTX kernels only for [compute capability](https://developer.nvidia.com/cuda-gpus) 7.0 to reduce the TF pip binary size. Earlier releases included PTX for a variety of older compute capabilities.
     * Remove environmental variable `TF_USE_CUDNN`.
@@ -718,6 +902,7 @@ stjohnso98, , , , ,
 * Fix the issue that `strategy.reduce()` inside `tf.function` may raise exceptions when the values to reduce are from loops or if-clauses.
 * Fix the issue that `tf.distribute.MirroredStrategy` cannot be used together with `tf.distribute.experimental.MultiWorkerMirroredStrategy`.
 * Add a `tf.distribute.cluster_resolver.TPUClusterResolver.connect` API to simplify TPU initialization.
+* Add `tf.distribute.Strategy.gather` and `tf.distribute.ReplicaContext.all_gather` methods to gather and concatenate `tf.distribute.DistributedValues` across workers and devices.
 
 ### `tf.keras`:
 * Introduces experimental preprocessing layers API (`tf.keras.layers.experimental.preprocessing`) to handle data preprocessing operations such as categorical feature encoding, text vectorization, data normalization, and data discretization (binning). The newly added layers provide a replacement for the legacy feature column API, and support composite tensor inputs.
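Two of the release-note entries above describe user-facing Python APIs; minimal sketches follow (tensor values and the plugin path are illustrative assumptions, not taken from this diff). First, the `quantize_and_dequantize_v2` migration described in the 2.4.0 breaking changes:

```python
import tensorflow as tf

x = tf.constant([-1.5, -0.5, 0.5, 1.5])
# New op: gradients for inputs outside [input_min, input_max] are 0.
y = tf.quantization.quantize_and_dequantize_v2(x, -1.0, 1.0)
# V1-style straight-through gradients, per the migration note above.
y_v1 = tf.grad_pass_through(tf.quantization.quantize_and_dequantize_v2)(
    x, -1.0, 1.0)
```

Second, the new `tf.experimental.register_filesystem_plugin` entry; the shared-object path is hypothetical:

```python
import tensorflow as tf

# Hypothetical path: a modular filesystem plugin built as a shared object
# against the C API declared under tensorflow/c/experimental/filesystem/.
tf.experimental.register_filesystem_plugin("/tmp/libcustom_fs_plugin.so")
```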
diff --git a/WORKSPACE b/WORKSPACE
index fa39cedae9b..9db1d9b80eb 100644
--- a/WORKSPACE
+++ b/WORKSPACE
@@ -113,26 +113,10 @@ http_archive(
 
 # Required for dependency @com_github_grpc_grpc
 load("@com_github_grpc_grpc//bazel:grpc_deps.bzl", "grpc_deps")
-
 grpc_deps()
 
-load(
-    "@build_bazel_rules_apple//apple:repositories.bzl",
-    "apple_rules_dependencies",
-)
-
-apple_rules_dependencies()
-
-load(
-    "@build_bazel_apple_support//lib:repositories.bzl",
-    "apple_support_dependencies",
-)
-
-apple_support_dependencies()
-
-load("@upb//bazel:repository_defs.bzl", "bazel_version_repository")
-
-bazel_version_repository(name = "bazel_version")
+load("@com_github_grpc_grpc//bazel:grpc_extra_deps.bzl", "grpc_extra_deps")
+grpc_extra_deps()
 
 load("//third_party/googleapis:repository_rules.bzl", "config_googleapis")
diff --git a/configure.py b/configure.py
index e381c8c20db..2f9902bc93e 100644
--- a/configure.py
+++ b/configure.py
@@ -1163,12 +1163,9 @@ def set_system_libs_flag(environ_cp):
     syslibs = ','.join(sorted(syslibs.split()))
     write_action_env_to_bazelrc('TF_SYSTEM_LIBS', syslibs)
 
-  if 'PREFIX' in environ_cp:
-    write_to_bazelrc('build --define=PREFIX=%s' % environ_cp['PREFIX'])
-  if 'LIBDIR' in environ_cp:
-    write_to_bazelrc('build --define=LIBDIR=%s' % environ_cp['LIBDIR'])
-  if 'INCLUDEDIR' in environ_cp:
-    write_to_bazelrc('build --define=INCLUDEDIR=%s' % environ_cp['INCLUDEDIR'])
+  for varname in ('PREFIX', 'LIBDIR', 'INCLUDEDIR', 'PROTOBUF_INCLUDE_PATH'):
+    if varname in environ_cp:
+      write_to_bazelrc('build --define=%s=%s' % (varname, environ_cp[varname]))
 
 
 def is_reduced_optimize_huge_functions_available(environ_cp):
@@ -1487,7 +1484,6 @@ def main():
     config_info_line('mkl', 'Build with MKL support.')
     config_info_line('mkl_aarch64', 'Build with oneDNN support for Aarch64.')
     config_info_line('monolithic', 'Config for mostly static monolithic build.')
-    config_info_line('ngraph', 'Build with Intel nGraph support.')
     config_info_line('numa', 'Build with NUMA support.')
     config_info_line(
         'dynamic_kernels',
diff --git a/tensorflow/BUILD b/tensorflow/BUILD
index 15ef7f21ed0..379b483e5d2 100644
--- a/tensorflow/BUILD
+++ b/tensorflow/BUILD
@@ -3,6 +3,7 @@
 # learning applications.
 
 load("@bazel_skylib//lib:selects.bzl", "selects")
+load("@bazel_skylib//rules:common_settings.bzl", "bool_flag")
 load("//tensorflow:tensorflow.bzl", "VERSION", "tf_cc_shared_object", "tf_custom_op_library_additional_deps_impl", "tf_native_cc_binary")
 load(
     "//tensorflow/core/platform:build_config.bzl",
@@ -22,10 +23,6 @@ load(
     "//tensorflow/python/tools/api/generator:api_init_files_v1.bzl",
     "TENSORFLOW_API_INIT_FILES_V1",  # @unused
 )
-load(
-    "//third_party/ngraph:build_defs.bzl",
-    "if_ngraph",
-)
 load(
     "//third_party/mkl:build_defs.bzl",
     "if_mkl_ml",
@@ -238,6 +235,12 @@ config_setting(
     visibility = ["//visibility:public"],
 )
 
+config_setting(
+    name = "linux_mips64",
+    values = {"cpu": "mips64"},
+    visibility = ["//visibility:public"],
+)
+
 config_setting(
     name = "debug",
     values = {
@@ -465,14 +468,6 @@ config_setting(
     visibility = ["//visibility:public"],
 )
 
-# This flag is set from the configure step when the user selects with nGraph option.
-# By default it should be false
-config_setting(
-    name = "with_ngraph_support",
-    values = {"define": "with_ngraph_support=true"},
-    visibility = ["//visibility:public"],
-)
-
 # This flag specifies whether TensorFlow 2.0 API should be built instead
 # of 1.* API. Note that TensorFlow 2.0 API is currently under development.
config_setting(
@@ -563,18 +558,45 @@ selects.config_setting_group(
     ],
 )
 
+# 'enable_registration_v2' opts-in to a different implementation of op and
+# kernel registration - REGISTER_OP, REGISTER_KERNEL_BUILDER, etc.
+#
+# This setting is currently experimental. The 'v2' implementation does _not_
+# correspond to a particular, finalized design; rather, it relates to
+# developing one.
+#
+# The current aim of the 'v2' implementation is to allow 'unused' ops and
+# kernels to be discarded by the linker (to the benefit of binary size).
+bool_flag(
+    name = "enable_registration_v2",
+    build_setting_default = False,
+    visibility = ["//visibility:public"],
+)
+
+config_setting(
+    name = "registration_v1",
+    flag_values = {":enable_registration_v2": "False"},
+    visibility = ["//visibility:public"],
+)
+
+config_setting(
+    name = "registration_v2",
+    flag_values = {":enable_registration_v2": "True"},
+    visibility = ["//visibility:public"],
+)
+
 # DO NOT ADD ANY NEW EXCEPTIONS TO THIS LIST!
 # Instead, please use public APIs or public build rules TF provides.
 # If you need functionality that is not exposed, we will work with you to expand our public APIs.
 package_group(
     name = "internal",
-    packages = ["//tensorflow/..."],
+    packages = [
+        "//learning/lib/ami/simple_ml/...",
+        "//tensorflow/...",
+    ],
 )
 
-package_group(
-    name = "ndarray_tensor_allow_list",
-    packages = ["//learning/pathways/..."],
-)
+package_group(name = "ndarray_tensor_allow_list")
 
 # Packages that use private types symbols, until they are exported.
 # TODO(b/154650521) Remove.
@@ -605,7 +627,7 @@ bzl_library(
         "//tensorflow/core/platform/default:cuda_build_defs_bzl",
         "//third_party/mkl:build_defs_bzl",
         "//third_party/mkl_dnn:build_defs_bzl",
-        "//third_party/ngraph:build_defs_bzl",
+        "@bazel_skylib//rules:common_settings",
         "@local_config_cuda//cuda:build_defs_bzl",
         "@local_config_rocm//rocm:build_defs_bzl",
         "@local_config_tensorrt//:build_defs_bzl",
@@ -706,8 +728,11 @@ tf_cc_shared_object(
     visibility = ["//visibility:public"],
     deps = [
         "//tensorflow/c/experimental/filesystem:filesystem_interface",
+        "//tensorflow/c/experimental/stream_executor:stream_executor_hdrs",
+        "//tensorflow/c:kernels_hdrs",
+        "//tensorflow/c:ops_hdrs",
         "//tensorflow/cc/saved_model:loader_lite_impl",
-        "//tensorflow/core:core_cpu_impl",
+        "//tensorflow/core/common_runtime:core_cpu_impl",
         "//tensorflow/core:framework_internal_impl",
         "//tensorflow/core/common_runtime/gpu:gpu_runtime_impl",
         "//tensorflow/core/grappler/optimizers:custom_graph_optimizer_registry_impl",
@@ -809,7 +834,7 @@ tf_cc_shared_object(
         "//tensorflow/cc:scope",
         "//tensorflow/cc/profiler",
         "//tensorflow/core:tensorflow",
-    ] + if_ngraph(["@ngraph_tf//:ngraph_tf"]),
+    ],
 )
 
 # ** Targets for Windows build (start) **
diff --git a/tensorflow/c/BUILD b/tensorflow/c/BUILD
index 677ab3355ff..3f4d70ed60e 100644
--- a/tensorflow/c/BUILD
+++ b/tensorflow/c/BUILD
@@ -202,6 +202,7 @@ tf_cuda_library(
         ":tf_status",
         ":tf_tensor",
         "@com_google_absl//absl/strings",
+        "//tensorflow/c/experimental/filesystem:modular_filesystem",
         "//tensorflow/cc/saved_model:loader_lite",
         "//tensorflow/cc:gradients",
         "//tensorflow/cc:ops",
@@ -217,6 +218,8 @@ tf_cuda_library(
         "//tensorflow/core:lib_internal",
         "//tensorflow/core/distributed_runtime:server_lib",
         "//tensorflow/core/kernels:logging_ops",
+        "//tensorflow/compiler/mlir/tfr:node_expansion_pass",
+        "//tensorflow/compiler/mlir/tfr:graph_decompose_pass",
     ],
     }),
     alwayslink = 1,
@@ -509,6 +512,18 @@ tf_cuda_library(
     ],
 )
 
+cc_library(
+    name = "kernels_hdrs",
+    hdrs = ["kernels.h"],
+    visibility = ["//tensorflow:internal"],
+    deps = [
+        ":c_api_internal",
+        ":tf_datatype",
+        ":tf_status",
+        ":tf_tensor",
+    ],
+)
+
 tf_cuda_library(
     name = "kernels",
     srcs = [
@@ -562,6 +577,16 @@ tf_cuda_library(
     alwayslink = 1,
 )
 
+cc_library(
+    name = "ops_hdrs",
+    hdrs = ["ops.h"],
+    visibility = ["//tensorflow:internal"],
+    deps = [
+        ":tf_datatype",
+        ":tf_status",
+    ],
+)
+
 # -----------------------------------------------------------------------------
 # Tests
diff --git a/tensorflow/c/c_api.cc b/tensorflow/c/c_api.cc
index a03e9227a75..9579efab94d 100644
--- a/tensorflow/c/c_api.cc
+++ b/tensorflow/c/c_api.cc
@@ -25,6 +25,7 @@ limitations under the License.
 #include "tensorflow/core/platform/platform.h"  // NOLINT
 
 #if !defined(IS_MOBILE_PLATFORM) && !defined(IS_SLIM_BUILD)
+#include "tensorflow/c/experimental/filesystem/modular_filesystem.h"
 #include "tensorflow/cc/framework/gradients.h"
 #include "tensorflow/cc/framework/ops.h"
 #include "tensorflow/cc/framework/scope_internal.h"
@@ -2606,4 +2607,14 @@ void TF_RegisterLogListener(void (*listener)(const char*)) {
 #endif  // !defined(IS_MOBILE_PLATFORM) && !defined(IS_SLIM_BUILD)
 }
 
+void TF_RegisterFilesystemPlugin(const char* plugin_filename,
+                                 TF_Status* status) {
+#if defined(IS_MOBILE_PLATFORM) || defined(IS_SLIM_BUILD)
+  status->status = tensorflow::errors::Unimplemented(
+      "FileSystem plugin functionality is not supported on mobile");
+#else
+  status->status = tensorflow::RegisterFilesystemPlugin(plugin_filename);
+#endif  // defined(IS_MOBILE_PLATFORM) || defined(IS_SLIM_BUILD)
+}
+
 }  // end extern "C"
diff --git a/tensorflow/c/c_api.h b/tensorflow/c/c_api.h
index db5f8fd68f8..f550b690e27 100644
--- a/tensorflow/c/c_api.h
+++ b/tensorflow/c/c_api.h
@@ -1577,6 +1577,13 @@ TF_CAPI_EXPORT extern void TF_DeleteServer(TF_Server* server);
 TF_CAPI_EXPORT extern void TF_RegisterLogListener(
     void (*listener)(const char*));
 
+// Register a FileSystem plugin from filename `plugin_filename`.
+//
+// On success, place OK in status.
+// On failure, place an error status in status.
+TF_CAPI_EXPORT extern void TF_RegisterFilesystemPlugin(
+    const char* plugin_filename, TF_Status* status);
+
 #ifdef __cplusplus
 } /* end extern "C" */
 #endif
diff --git a/tensorflow/c/c_api_experimental.cc b/tensorflow/c/c_api_experimental.cc
index 81fb9d1a2b8..0d188aa5ee0 100644
--- a/tensorflow/c/c_api_experimental.cc
+++ b/tensorflow/c/c_api_experimental.cc
@@ -561,15 +561,15 @@ TF_CAPI_EXPORT extern void TFE_AbortCollectiveOps(TFE_Context* ctx,
   collective_executor_handle->get()->StartAbort(status->status);
 }
 
-TF_CAPI_EXPORT extern void TFE_CollectiveOpsCheckPeerHealth(TFE_Context* ctx,
-                                                            const char* task,
-                                                            TF_Status* status) {
+TF_CAPI_EXPORT extern void TFE_CollectiveOpsCheckPeerHealth(
+    TFE_Context* ctx, const char* task, int64_t timeout_in_ms,
+    TF_Status* status) {
   tensorflow::EagerContext* context =
       tensorflow::ContextFromInterface(tensorflow::unwrap(ctx));
   auto collective_executor_handle = context->GetCollectiveExecutorHandle();
   tensorflow::Notification done;
   collective_executor_handle->get()->remote_access()->CheckPeerHealth(
-      task, [&done, status](const Status& s) {
+      task, timeout_in_ms, [&done, status](const Status& s) {
         status->status = s;
         done.Notify();
       });
diff --git a/tensorflow/c/c_api_experimental.h b/tensorflow/c/c_api_experimental.h
index c9c74f4e874..90e074d232f 100644
--- a/tensorflow/c/c_api_experimental.h
+++ b/tensorflow/c/c_api_experimental.h
@@ -241,9 +241,9 @@ TF_CAPI_EXPORT extern void TFE_AbortCollectiveOps(TFE_Context* ctx,
 // Checks the health of collective ops peers. Explicit health check is needed in
 // multi worker collective ops to detect failures in the cluster. If a peer is
 // down, collective ops may hang.
-TF_CAPI_EXPORT extern void TFE_CollectiveOpsCheckPeerHealth(TFE_Context* ctx,
-                                                            const char* task,
-                                                            TF_Status* status);
+TF_CAPI_EXPORT extern void TFE_CollectiveOpsCheckPeerHealth(
+    TFE_Context* ctx, const char* task, int64_t timeout_in_ms,
+    TF_Status* status);
 
 // Information about the shape of a Tensor and its type.
 struct TF_ShapeAndType {
diff --git a/tensorflow/c/eager/BUILD b/tensorflow/c/eager/BUILD
index b90b2644269..fa0fdbae861 100644
--- a/tensorflow/c/eager/BUILD
+++ b/tensorflow/c/eager/BUILD
@@ -10,6 +10,9 @@ load(
     "tf_cuda_library",
 )
 
+# buildifier: disable=same-origin-load
+load("//tensorflow:tensorflow.bzl", "cc_header_only_library")
+
 # buildifier: disable=same-origin-load
 load("//tensorflow:tensorflow.bzl", "filegroup")
@@ -94,6 +97,7 @@ tf_cuda_library(
         "//tensorflow/core/distributed_runtime:remote_device",
         "//tensorflow/core/distributed_runtime:server_lib",
         "//tensorflow/core/distributed_runtime:worker_env",
+        "//tensorflow/core/distributed_runtime:worker_interface",
         "//tensorflow/core:gpu_runtime",
     ] + internal_tfrt_deps(),
     alwayslink = 1,
@@ -106,6 +110,7 @@ filegroup(
         "abstract_function.h",
         "abstract_operation.h",
         "abstract_tensor_handle.h",
+        "c_api.h",
         "c_api_experimental.h",
         "c_api_internal.h",
         "c_api_unified_experimental.h",
@@ -638,6 +643,19 @@ cc_library(
     ],
 )
 
+cc_header_only_library(
+    name = "tfe_tensorhandle_internal_hdrs_only",
+    extra_deps = [
+        "@com_google_absl//absl/strings",
+    ],
+    visibility = [
+        "//tensorflow:internal",
+    ],
+    deps = [
+        ":tfe_tensorhandle_internal",
+    ],
+)
+
 tf_cuda_library(
     name = "c_api_test_util",
     testonly = 1,
diff --git a/tensorflow/c/eager/abstract_context.h b/tensorflow/c/eager/abstract_context.h
index d31b1e13611..07a78f97bd5 100644
--- a/tensorflow/c/eager/abstract_context.h
+++ b/tensorflow/c/eager/abstract_context.h
@@ -32,7 +32,7 @@ namespace tensorflow {
 // environment, a traced representation etc.
 class AbstractContext {
  protected:
-  enum AbstractContextKind { kGraph, kMlir, kEager, kTfrt, kTape };
+  enum AbstractContextKind { kGraph, kMlir, kEager, kTfrt, kTape, kOpHandler };
   explicit AbstractContext(AbstractContextKind kind) : kind_(kind) {}
   virtual ~AbstractContext() {}
diff --git a/tensorflow/c/eager/abstract_operation.h b/tensorflow/c/eager/abstract_operation.h
index 4c630528f5d..997c8e0e441 100644
--- a/tensorflow/c/eager/abstract_operation.h
+++ b/tensorflow/c/eager/abstract_operation.h
@@ -30,7 +30,14 @@ namespace tensorflow {
 // tracing or immediate execution mode.
 class AbstractOperation {
  protected:
-  enum AbstractOperationKind { kGraph, kMlir, kEager, kTfrt, kTape };
+  enum AbstractOperationKind {
+    kGraph,
+    kMlir,
+    kEager,
+    kTfrt,
+    kTape,
+    kOpHandler
+  };
   explicit AbstractOperation(AbstractOperationKind kind) : kind_(kind) {}
   virtual ~AbstractOperation() {}
diff --git a/tensorflow/c/eager/c_api.cc b/tensorflow/c/eager/c_api.cc
index 5f388bfe0cd..9c73d1aba8c 100644
--- a/tensorflow/c/eager/c_api.cc
+++ b/tensorflow/c/eager/c_api.cc
@@ -70,6 +70,7 @@ limitations under the License.
 #include "tensorflow/core/distributed_runtime/rpc/rpc_rendezvous_mgr.h"
 #include "tensorflow/core/distributed_runtime/server_lib.h"
 #include "tensorflow/core/distributed_runtime/worker_env.h"
+#include "tensorflow/core/distributed_runtime/worker_interface.h"
 #include "tensorflow/core/distributed_runtime/eager/cluster_function_library_runtime.h"
 #endif  // !IS_MOBILE_PLATFORM
 #include "tensorflow/core/framework/node_def_util.h"
@@ -855,41 +856,42 @@ TF_CAPI_EXPORT extern bool TFE_ContextCheckAlive(TFE_Context* ctx,
 #else   // !defined(IS_MOBILE_PLATFORM)
   tensorflow::EagerContext* context =
       tensorflow::ContextFromInterface(tensorflow::unwrap(ctx));
-  // TODO(yuefengz): support partially specified `worker_name`.
-  tensorflow::core::RefCountPtr<tensorflow::eager::EagerClient> eager_client;
-  status->status = context->GetClient(worker_name, &eager_client);
-  if (!status->status.ok()) {
+  tensorflow::GrpcServer* grpc_server =
+      dynamic_cast<tensorflow::GrpcServer*>(context->GetServer());
+  if (grpc_server == nullptr) {
+    status->status =
+        tensorflow::errors::Internal("Failed to get tensorflow::GrpcServer.");
+    return false;
+  }
+  tensorflow::WorkerInterface* wi =
+      grpc_server->master_env()->worker_cache->GetOrCreateWorker(worker_name);
+  if (wi == nullptr) {
+    status->status = tensorflow::errors::InvalidArgument(
+        "Unable to find worker interface corresponding to task ", worker_name);
     return false;
   }
 
-  // Send a rpc request to the worker to check aliveness.
-  tensorflow::eager::KeepAliveRequest request;
-  request.set_context_id(context->GetContextId());
-  tensorflow::eager::KeepAliveResponse response;
-
-  tensorflow::Status keep_alive_status;
+  tensorflow::GetStatusRequest request;
+  tensorflow::GetStatusResponse response;
+  tensorflow::Status remote_status;
   tensorflow::Notification done;
-  eager_client->KeepAliveAsync(
-      &request, &response,
-      [&keep_alive_status, &done](const tensorflow::Status& s) {
-        keep_alive_status = s;
-        done.Notify();
-      });
+  wi->GetStatusAsync(/*opts_=*/nullptr, &request, &response, /*fail_fast=*/true,
+                     [&remote_status, &done](const tensorflow::Status& s) {
+                       remote_status = s;
+                       done.Notify();
+                     });
   done.WaitForNotification();
 
+  // We set OK status so the call does not raise any exceptions. Instead, the
+  // caller uses the return value to tell if the remote worker is alive.
   status->status = tensorflow::Status::OK();
 
-  // If `context_id` doesn't exist on the remote worker, an InvalidArgument
-  // error will return. But this still indicates that the remote worker is
-  // alive.
-  if (keep_alive_status.ok() ||
-      keep_alive_status.code() == tensorflow::error::INVALID_ARGUMENT) {
+  if (remote_status.ok()) {
     return true;
-  } else {
-    LOG(INFO) << "Remote worker " << worker_name
-              << " is not alive: " << keep_alive_status.error_message();
-    return false;
   }
+  LOG(INFO) << "Remote worker " << worker_name
+            << " is not alive: " << remote_status.error_message();
+  return false;
 #endif  // !IS_MOBILE_PLATFORM
 }
 
@@ -1445,13 +1447,11 @@ TFE_TensorHandle* TFE_NewTensorHandle(const tensorflow::Tensor& t,
 
 void TFE_ContextExportRunMetadata(TFE_Context* ctx, TF_Buffer* buf,
                                   TF_Status* status) {
-  tensorflow::EagerContext* context =
-      tensorflow::ContextFromInterface(tensorflow::unwrap(ctx));
-  status->status = context->Executor().WaitForAllPendingNodes();
+  auto* context = tensorflow::unwrap(ctx);
+  status->status = context->AsyncWait();
   if (!status->status.ok()) return;
-  tensorflow::mutex_lock ml(*context->MetadataMu());
-  status->status = MessageToBuffer(*context->RunMetadataProto(), buf);
-  context->ClearRunMetadata();
+  auto run_metadata = context->ExportRunMetadata();
+  status->status = MessageToBuffer(*run_metadata, buf);
 }
 
 namespace {
diff --git a/tensorflow/c/eager/c_api_experimental.cc b/tensorflow/c/eager/c_api_experimental.cc
index cc2270755bf..1ef536a66f6 100644
--- a/tensorflow/c/eager/c_api_experimental.cc
+++ b/tensorflow/c/eager/c_api_experimental.cc
@@ -638,3 +638,19 @@ void TFE_ContextSetLogDevicePlacement(TFE_Context* ctx, unsigned char enable,
                                       TF_Status* status) {
   tensorflow::unwrap(ctx)->SetLogDevicePlacement(enable);
 }
+
+const char* TFE_TensorHandleDeviceType(TFE_TensorHandle* h, TF_Status* status) {
+  if (h == nullptr) {
+    status->status = tensorflow::errors::InvalidArgument("Invalid handle");
+    return nullptr;
+  }
+  return tensorflow::unwrap(h)->DeviceType(&status->status);
+}
+
+int TFE_TensorHandleDeviceID(TFE_TensorHandle* h, TF_Status* status) {
+  if (h == nullptr) {
+    status->status = tensorflow::errors::InvalidArgument("Invalid handle");
+    return -1;
+  }
+  return tensorflow::unwrap(h)->DeviceId(&status->status);
+}
diff --git a/tensorflow/c/eager/c_api_experimental.h b/tensorflow/c/eager/c_api_experimental.h
index 12546c6082a..d0739a5437d 100644
--- a/tensorflow/c/eager/c_api_experimental.h
+++ b/tensorflow/c/eager/c_api_experimental.h
@@ -553,6 +553,14 @@ TF_CAPI_EXPORT void TFE_ContextSetLogDevicePlacement(TFE_Context* ctx,
                                                      unsigned char enable,
                                                      TF_Status* status);
 
+// Returns the device type of the operation that produced `h`.
+TF_CAPI_EXPORT extern const char* TFE_TensorHandleDeviceType(
+    TFE_TensorHandle* h, TF_Status* status);
+
+// Returns the device ID of the operation that produced `h`.
+TF_CAPI_EXPORT extern int TFE_TensorHandleDeviceID(TFE_TensorHandle* h,
+                                                   TF_Status* status);
+
 #ifdef __cplusplus
 } /* end extern "C" */
 #endif
diff --git a/tensorflow/c/eager/c_api_experimental_test.cc b/tensorflow/c/eager/c_api_experimental_test.cc
index 4975d303375..4fe83b5116d 100644
--- a/tensorflow/c/eager/c_api_experimental_test.cc
+++ b/tensorflow/c/eager/c_api_experimental_test.cc
@@ -411,5 +411,109 @@ TEST(CAPI, TensorHandleOnDeviceMemory) {
   TF_DeleteStatus(status);
 }
 
+TEST(CAPI, TensorHandleNullptr) {
+  TFE_TensorHandle* h = nullptr;
+  std::unique_ptr<TF_Status, decltype(&TF_DeleteStatus)> status(
+      TF_NewStatus(), TF_DeleteStatus);
+
+  const char* device_type = TFE_TensorHandleDeviceType(h, status.get());
+  ASSERT_EQ(TF_INVALID_ARGUMENT, TF_GetCode(status.get()));
+  ASSERT_EQ(device_type, nullptr);
+  ASSERT_EQ("Invalid handle", string(TF_Message(status.get())));
+
+  TF_SetStatus(status.get(), TF_OK, "");
+
+  int device_id = TFE_TensorHandleDeviceID(h, status.get());
+  ASSERT_EQ(TF_INVALID_ARGUMENT, TF_GetCode(status.get()));
+  ASSERT_EQ(device_id, -1);
+  ASSERT_EQ("Invalid handle", string(TF_Message(status.get())));
+}
+
+TEST(CAPI, TensorHandleDevices) {
+  std::unique_ptr<TF_Status, decltype(&TF_DeleteStatus)> status(
+      TF_NewStatus(), TF_DeleteStatus);
+  TFE_ContextOptions* opts = TFE_NewContextOptions();
+  TFE_Context* ctx = TFE_NewContext(opts, status.get());
+  TFE_DeleteContextOptions(opts);
+  ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get());
+
+  TFE_TensorHandle* hcpu = TestMatrixTensorHandle(ctx);
+  const char* device_type = TFE_TensorHandleDeviceType(hcpu, status.get());
+  ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get());
+  ASSERT_TRUE(absl::StrContains(device_type, "CPU")) << device_type;
+  int device_id = TFE_TensorHandleDeviceID(hcpu, status.get());
+  ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get());
+  ASSERT_EQ(0, device_id) << device_id;
+
+  // Disable the test if no GPU is present.
+  string gpu_device_name;
+  if (GetDeviceName(ctx, &gpu_device_name, "GPU")) {
+    TFE_TensorHandle* hgpu = TFE_TensorHandleCopyToDevice(
+        hcpu, ctx, gpu_device_name.c_str(), status.get());
+    ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get());
+
+    TFE_Op* shape_op = ShapeOp(ctx, hgpu);
+    TFE_OpSetDevice(shape_op, gpu_device_name.c_str(), status.get());
+    ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get());
+    TFE_TensorHandle* retvals[1];
+    int num_retvals = 1;
+    TFE_Execute(shape_op, &retvals[0], &num_retvals, status.get());
+    ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get());
+
+    device_type = TFE_TensorHandleDeviceType(retvals[0], status.get());
+    ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get());
+    ASSERT_TRUE(absl::StrContains(device_type, "GPU")) << device_type;
+
+    device_id = TFE_TensorHandleDeviceID(retvals[0], status.get());
+    ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get());
+    ASSERT_EQ(0, device_id) << device_id;
+
+    TFE_DeleteOp(shape_op);
+    TFE_DeleteTensorHandle(retvals[0]);
+    TFE_DeleteTensorHandle(hgpu);
+  }
+
+  TFE_DeleteTensorHandle(hcpu);
+  TFE_Executor* executor = TFE_ContextGetExecutorForThread(ctx);
+  TFE_ExecutorWaitForAllPendingNodes(executor, status.get());
+  ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get());
+  TFE_DeleteExecutor(executor);
+  TFE_DeleteContext(ctx);
+}
+
+TEST(CAPI, TensorHandleDefaults) {
+  std::unique_ptr<TF_Status, decltype(&TF_DeleteStatus)> status(
+      TF_NewStatus(), TF_DeleteStatus);
+  TFE_ContextOptions* opts = TFE_NewContextOptions();
+  TFE_Context* ctx = TFE_NewContext(opts, status.get());
+  TFE_DeleteContextOptions(opts);
+  ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get());
+
+  TFE_TensorHandle* h_default = TestMatrixTensorHandle(ctx);
+  const char* device_type = TFE_TensorHandleDeviceType(h_default, status.get());
+  ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get());
+  ASSERT_TRUE(absl::StrContains(device_type, "CPU")) << device_type;
+  int device_id = TFE_TensorHandleDeviceID(h_default, status.get());
+  ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get());
+  ASSERT_EQ(0, device_id) << device_id;
+
+  TFE_TensorHandle* h_cpu = TFE_TensorHandleCopyToDevice(
+      h_default, ctx, "/device:CPU:0", status.get());
+  const char* device_type_cpu = TFE_TensorHandleDeviceType(h_cpu, status.get());
+  ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get());
+  ASSERT_TRUE(absl::StrContains(device_type_cpu, "CPU")) << device_type_cpu;
+  int device_id_cpu = TFE_TensorHandleDeviceID(h_cpu, status.get());
+  ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get());
+  ASSERT_EQ(0, device_id_cpu) << device_id_cpu;
+
+  TFE_DeleteTensorHandle(h_default);
+  TFE_DeleteTensorHandle(h_cpu);
+  TFE_Executor* executor = TFE_ContextGetExecutorForThread(ctx);
+  TFE_ExecutorWaitForAllPendingNodes(executor, status.get());
+  ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get());
+  TFE_DeleteExecutor(executor);
+  TFE_DeleteContext(ctx);
+}
+
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/c/eager/c_api_test.cc b/tensorflow/c/eager/c_api_test.cc
index fd208c6770d..0f5f494e5e2 100644
--- a/tensorflow/c/eager/c_api_test.cc
+++ b/tensorflow/c/eager/c_api_test.cc
@@ -769,7 +769,7 @@ void Execute_MatMul_CPU_Runtime_Error(bool async) {
   TF_Tensor* t = TFE_TensorHandleResolve(retvals[0], status);
   EXPECT_NE(TF_OK, TF_GetCode(status));
   EXPECT_EQ(nullptr, t);
-  const char* msg = "Matrix size-incompatible: In[0]: [2,2], In[1]: [3,2]";
msg = "Matrix size-incompatible: In[0]: [2,2], In[1]: [3,2]"; + const char* msg = "In[0] mismatch In[1] shape: 2 vs. 3: [2,2] [3,2]"; EXPECT_TRUE(strstr(TF_Message(status), msg) != nullptr) << TF_Message(status); // Since error is not cleared, the following copy with correct device will diff --git a/tensorflow/c/eager/gradient_checker_test.cc b/tensorflow/c/eager/gradient_checker_test.cc index 7a438085fb5..393ad2ceb98 100644 --- a/tensorflow/c/eager/gradient_checker_test.cc +++ b/tensorflow/c/eager/gradient_checker_test.cc @@ -29,6 +29,7 @@ limitations under the License. #include "tensorflow/c/tf_tensor.h" #include "tensorflow/core/lib/llvm_rtti/llvm_rtti.h" #include "tensorflow/core/platform/errors.h" +#include "tensorflow/core/platform/tensor_float_32_utils.h" #include "tensorflow/core/platform/test.h" namespace tensorflow { @@ -56,6 +57,9 @@ Status RegisterGradients(GradientRegistry* registry) { } TEST_P(GradientCheckerTest, TestGradCheckMatMul) { + // Computing numerical gradients with TensorFloat-32 is numerically unstable + enable_tensor_float_32_execution(false); + std::unique_ptr status( TF_NewStatus(), TF_DeleteStatus); AbstractContextPtr ctx; diff --git a/tensorflow/c/eager/gradients_test.cc b/tensorflow/c/eager/gradients_test.cc index cd4febba8c1..a81d7aa6952 100644 --- a/tensorflow/c/eager/gradients_test.cc +++ b/tensorflow/c/eager/gradients_test.cc @@ -62,10 +62,12 @@ Status RegisterGradients(GradientRegistry* registry) { TF_RETURN_IF_ERROR(registry->Register("Exp", ExpRegisterer)); TF_RETURN_IF_ERROR(registry->Register("IdentityN", IdentityNRegisterer)); TF_RETURN_IF_ERROR(registry->Register("Sqrt", SqrtRegisterer)); + TF_RETURN_IF_ERROR(registry->Register("Neg", NegRegisterer)); + TF_RETURN_IF_ERROR(registry->Register("Sub", SubRegisterer)); + TF_RETURN_IF_ERROR(registry->Register("Mul", MulRegisterer)); return Status::OK(); } - // Computes // y = inputs[0] + inputs[1] // return grad(y, {inputs[0], inputs[1]}) @@ -74,11 +76,11 @@ Status AddGradModel(AbstractContext* ctx, absl::Span outputs, const GradientRegistry& registry) { TapeVSpace vspace(ctx); - auto tape = new Tape(/*persistent=*/false); + auto tape = std::make_unique(/*persistent=*/false); tape->Watch(ToId(inputs[0])); // Watch x. tape->Watch(ToId(inputs[1])); // Watch y. std::vector add_outputs(1); - AbstractContextPtr tape_ctx(new TapeContext(ctx, tape, registry)); + AbstractContextPtr tape_ctx(new TapeContext(ctx, tape.get(), registry)); TF_RETURN_IF_ERROR(ops::Add(tape_ctx.get(), inputs, absl::MakeSpan(add_outputs), "Add")); // Compute x+y. @@ -97,7 +99,6 @@ Status AddGradModel(AbstractContext* ctx, } outputs[0] = out_grads[0]; outputs[1] = out_grads[1]; - delete tape; return Status::OK(); } @@ -109,10 +110,10 @@ Status ExpGradModel(AbstractContext* ctx, absl::Span outputs, const GradientRegistry& registry) { TapeVSpace vspace(ctx); - auto tape = new Tape(/*persistent=*/false); + auto tape = std::make_unique(/*persistent=*/false); tape->Watch(ToId(inputs[0])); // Watch x. 
std::vector exp_outputs(1); - AbstractContextPtr tape_ctx(new TapeContext(ctx, tape, registry)); + AbstractContextPtr tape_ctx(new TapeContext(ctx, tape.get(), registry)); TF_RETURN_IF_ERROR( ops::Exp(tape_ctx.get(), inputs, absl::MakeSpan(exp_outputs), "Exp")); std::unordered_map @@ -128,7 +129,6 @@ Status ExpGradModel(AbstractContext* ctx, exp_output->Unref(); } outputs[0] = out_grads[0]; - delete tape; return Status::OK(); } @@ -140,10 +140,10 @@ Status SqrtGradModel(AbstractContext* ctx, absl::Span outputs, const GradientRegistry& registry) { TapeVSpace vspace(ctx); - auto tape = new Tape(/*persistent=*/false); + auto tape = std::make_unique(/*persistent=*/false); tape->Watch(ToId(inputs[0])); // Watch x. std::vector sqrt_outputs(1); - AbstractContextPtr tape_ctx(new TapeContext(ctx, tape, registry)); + AbstractContextPtr tape_ctx(new TapeContext(ctx, tape.get(), registry)); TF_RETURN_IF_ERROR( ops::Sqrt(tape_ctx.get(), inputs, absl::MakeSpan(sqrt_outputs), "Sqrt")); std::unordered_map @@ -159,7 +159,6 @@ Status SqrtGradModel(AbstractContext* ctx, sqrt_output->Unref(); } outputs[0] = out_grads[0]; - delete tape; return Status::OK(); } @@ -172,12 +171,12 @@ Status IdentityNGradModel(AbstractContext* ctx, absl::Span outputs, const GradientRegistry& registry) { TapeVSpace vspace(ctx); - auto tape = new Tape(/*persistent=*/false); + auto tape = std::make_unique(/*persistent=*/false); tape->Watch(ToId(inputs[0])); tape->Watch(ToId(inputs[1])); vector identity_n_outputs(2); - AbstractContextPtr tape_ctx(new TapeContext(ctx, tape, registry)); + AbstractContextPtr tape_ctx(new TapeContext(ctx, tape.get(), registry)); TF_RETURN_IF_ERROR(ops::IdentityN( tape_ctx.get(), inputs, absl::MakeSpan(identity_n_outputs), "IdentityN")); @@ -195,6 +194,105 @@ Status IdentityNGradModel(AbstractContext* ctx, } outputs[0] = out_grads[0]; outputs[1] = out_grads[1]; + return Status::OK(); +} + +// Computes +// y = - inputs[0] +// return grad(y, {inputs[0]}) +Status NegGradModel(AbstractContext* ctx, + absl::Span inputs, + absl::Span outputs, + const GradientRegistry& registry) { + TapeVSpace vspace(ctx); + auto tape = std::make_unique(/*persistent=*/false); + tape->Watch(ToId(inputs[0])); + + std::vector neg_outputs(1); + AbstractContextPtr tape_ctx(new TapeContext(ctx, tape.get(), registry)); + TF_RETURN_IF_ERROR( + ops::Neg(tape_ctx.get(), inputs, absl::MakeSpan(neg_outputs), "Neg")); + + std::unordered_map + source_tensors_that_are_targets; + std::vector out_grads; + TF_RETURN_IF_ERROR(tape->ComputeGradient( + vspace, /*target_tensor_ids=*/{ToId(neg_outputs[0])}, + /*source_tensor_ids=*/{ToId(inputs[0])}, source_tensors_that_are_targets, + /*output_gradients=*/{}, &out_grads, + /*build_default_zeros_grads=*/false)); + for (auto neg_output : neg_outputs) { + neg_output->Unref(); + } + outputs[0] = out_grads[0]; + return Status::OK(); +} + +// Computes +// y = inputs[0] - inputs[1] +// return grad(y, {inputs[0], inputs[1]}) +Status SubGradModel(AbstractContext* ctx, + absl::Span inputs, + absl::Span outputs, + const GradientRegistry& registry) { + TapeVSpace vspace(ctx); + auto tape = std::make_unique(/*persistent=*/false); + tape->Watch(ToId(inputs[0])); // Watch x. + tape->Watch(ToId(inputs[1])); // Watch y. + std::vector sub_outputs(1); + AbstractContextPtr tape_ctx(new TapeContext(ctx, tape.get(), registry)); + TF_RETURN_IF_ERROR(ops::Sub(tape_ctx.get(), inputs, + absl::MakeSpan(sub_outputs), + "Sub")); // Compute x-y. 
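// (Editor's note on the expected values: for y = x - y0 the partials are
// dy/dx = 1 and dy/dy0 = -1, so with upstream gradient U the tape should
// return U for the first input and -U for the second. SubGradientFunction in
// math_grad.cc below implements exactly that, and TestSubGrad asserts it with
// U = 1.)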
+ std::unordered_map<tensorflow::int64, TapeTensor> + source_tensors_that_are_targets; + + std::vector<AbstractTensorHandle*> out_grads; + TF_RETURN_IF_ERROR(tape->ComputeGradient( + vspace, /*target_tensor_ids=*/{ToId(sub_outputs[0])}, + /*source_tensor_ids=*/{ToId(inputs[0]), ToId(inputs[1])}, + source_tensors_that_are_targets, + /*output_gradients=*/{}, &out_grads, + /*build_default_zeros_grads=*/false)); + for (auto sub_output : sub_outputs) { + sub_output->Unref(); + } + outputs[0] = out_grads[0]; + outputs[1] = out_grads[1]; + return Status::OK(); +} + +// Computes +// y = inputs[0] * inputs[1] +// return grad(y, {inputs[0], inputs[1]}) +Status MulGradModel(AbstractContext* ctx, + absl::Span<AbstractTensorHandle* const> inputs, + absl::Span<AbstractTensorHandle*> outputs, + const GradientRegistry& registry) { + TapeVSpace vspace(ctx); + auto tape = std::make_unique<Tape>(/*persistent=*/false); + tape->Watch(ToId(inputs[0])); // Watch x. + tape->Watch(ToId(inputs[1])); // Watch y. + std::vector<AbstractTensorHandle*> mul_outputs(1); + AbstractContextPtr tape_ctx(new TapeContext(ctx, tape.get(), registry)); + TF_RETURN_IF_ERROR(ops::Mul(tape_ctx.get(), inputs, + absl::MakeSpan(mul_outputs), + "Mul")); // Compute x*y. + std::unordered_map<tensorflow::int64, TapeTensor> + source_tensors_that_are_targets; + + std::vector<AbstractTensorHandle*> out_grads; + TF_RETURN_IF_ERROR(tape->ComputeGradient( + vspace, /*target_tensor_ids=*/{ToId(mul_outputs[0])}, + /*source_tensor_ids=*/{ToId(inputs[0]), ToId(inputs[1])}, + source_tensors_that_are_targets, + /*output_gradients=*/{}, &out_grads, + /*build_default_zeros_grads=*/false)); + for (auto mul_output : mul_outputs) { + mul_output->Unref(); + } + outputs[0] = out_grads[0]; + outputs[1] = out_grads[1]; + return Status::OK(); +} @@ -536,6 +634,172 @@ TEST_P(CppGradients, TestIdentityNGrad) { result_tensor = nullptr; } +TEST_P(CppGradients, TestNegGrad) { + std::unique_ptr<TF_Status, decltype(&TF_DeleteStatus)> status( + TF_NewStatus(), TF_DeleteStatus); + AbstractContextPtr ctx; + { + AbstractContext* ctx_raw = nullptr; + Status s = + BuildImmediateExecutionContext(std::get<1>(GetParam()), &ctx_raw); + ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + ctx.reset(ctx_raw); + } + + AbstractTensorHandlePtr x; + { + AbstractTensorHandle* x_raw = nullptr; + Status s = TestScalarTensorHandle(ctx.get(), 2.0f, &x_raw); + ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + x.reset(x_raw); + } + + GradientRegistry registry; + Status s = RegisterGradients(&registry); + ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + + // Pseudo-code: + // + // tape.watch(x) + // y = - x + // outputs = tape.gradient(y, x) + std::vector<AbstractTensorHandle*> outputs(1); + s = RunModel(NegGradModel, ctx.get(), {x.get()}, absl::MakeSpan(outputs), + /*use_function=*/!std::get<2>(GetParam()), registry); + ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + + TF_Tensor* result_tensor; + s = getValue(outputs[0], &result_tensor); + ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + auto result_value = static_cast<float*>(TF_TensorData(result_tensor)); + EXPECT_EQ(*result_value, -1.0); + outputs[0]->Unref(); + TF_DeleteTensor(result_tensor); + result_tensor = nullptr; +} + +TEST_P(CppGradients, TestSubGrad) { + std::unique_ptr<TF_Status, decltype(&TF_DeleteStatus)> status( + TF_NewStatus(), TF_DeleteStatus); + AbstractContextPtr ctx; + { + AbstractContext* ctx_raw = nullptr; + Status s = + BuildImmediateExecutionContext(std::get<1>(GetParam()), &ctx_raw); + ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + ctx.reset(ctx_raw); + } + + AbstractTensorHandlePtr x; + { + AbstractTensorHandle* x_raw = nullptr; + Status s = TestScalarTensorHandle(ctx.get(), 2.0f, &x_raw); + ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + x.reset(x_raw); + } + + 
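// (Editor's aside: x above and y below are both initialized to 2.0f, so the
// two gradient slots are told apart purely by value: d(x-y)/dx = 1.0 is
// expected in outputs[0] and d(x-y)/dy = -1.0 in outputs[1].)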
AbstractTensorHandlePtr y; + { + AbstractTensorHandle* y_raw = nullptr; + Status s = TestScalarTensorHandle(ctx.get(), 2.0f, &y_raw); + ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + y.reset(y_raw); + } + + GradientRegistry registry; + Status s = RegisterGradients(®istry); + ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + + // Pseudo-code: + // + // tape.watch(x) + // tape.watch(y) + // y = x - y + // outputs = tape.gradient(y, [x, y]) + std::vector outputs(2); + s = RunModel(SubGradModel, ctx.get(), {x.get(), y.get()}, + absl::MakeSpan(outputs), + /*use_function=*/!std::get<2>(GetParam()), registry); + ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + + TF_Tensor* result_tensor; + s = getValue(outputs[0], &result_tensor); + ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + auto result_value = static_cast(TF_TensorData(result_tensor)); + EXPECT_EQ(*result_value, 1.0); + outputs[0]->Unref(); + TF_DeleteTensor(result_tensor); + result_tensor = nullptr; + + s = getValue(outputs[1], &result_tensor); + ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + result_value = static_cast(TF_TensorData(result_tensor)); + EXPECT_EQ(*result_value, -1.0); + outputs[1]->Unref(); + TF_DeleteTensor(result_tensor); +} + +TEST_P(CppGradients, TestMulGrad) { + std::unique_ptr status( + TF_NewStatus(), TF_DeleteStatus); + AbstractContextPtr ctx; + { + AbstractContext* ctx_raw = nullptr; + Status s = + BuildImmediateExecutionContext(std::get<1>(GetParam()), &ctx_raw); + ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + ctx.reset(ctx_raw); + } + + AbstractTensorHandlePtr x; + { + AbstractTensorHandle* x_raw = nullptr; + Status s = TestScalarTensorHandle(ctx.get(), 1.0f, &x_raw); + ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + x.reset(x_raw); + } + + AbstractTensorHandlePtr y; + { + AbstractTensorHandle* y_raw = nullptr; + Status s = TestScalarTensorHandle(ctx.get(), 2.0f, &y_raw); + ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + y.reset(y_raw); + } + + GradientRegistry registry; + Status s = RegisterGradients(®istry); + ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + + // Pseudo-code: + // + // tape.watch(x) + // tape.watch(y) + // y = x * y + // outputs = tape.gradient(y, [x, y]) + std::vector outputs(2); + s = RunModel(MulGradModel, ctx.get(), {x.get(), y.get()}, + absl::MakeSpan(outputs), + /*use_function=*/!std::get<2>(GetParam()), registry); + ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + + TF_Tensor* result_tensor; + s = getValue(outputs[0], &result_tensor); + ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + auto result_value = static_cast(TF_TensorData(result_tensor)); + EXPECT_EQ(*result_value, 2.0); + outputs[0]->Unref(); + TF_DeleteTensor(result_tensor); + result_tensor = nullptr; + + s = getValue(outputs[1], &result_tensor); + ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + result_value = static_cast(TF_TensorData(result_tensor)); + EXPECT_EQ(*result_value, 1.0); + outputs[1]->Unref(); + TF_DeleteTensor(result_tensor); +} + TEST_P(CppGradients, TestSetAttrString) { std::unique_ptr status( TF_NewStatus(), TF_DeleteStatus); @@ -575,7 +839,7 @@ TEST_P(CppGradients, TestSetAttrString) { int num_retvals = 1; std::vector outputs(1); GradientRegistry registry; - std::unique_ptr tape(new Tape(/*persistent=*/false)); + auto tape = std::make_unique(/*persistent=*/false); s = Execute(check_numerics_op.get(), ctx.get(), absl::MakeSpan(outputs), &num_retvals, &forward_op, tape.get(), registry); ASSERT_EQ(errors::OK, s.code()) 
<< s.error_message(); diff --git a/tensorflow/c/eager/immediate_execution_context.h b/tensorflow/c/eager/immediate_execution_context.h index a3e3857b34b..27fa17127b8 100644 --- a/tensorflow/c/eager/immediate_execution_context.h +++ b/tensorflow/c/eager/immediate_execution_context.h @@ -29,6 +29,7 @@ limitations under the License. #include "tensorflow/core/framework/types.pb.h" #include "tensorflow/core/platform/status.h" #include "tensorflow/core/platform/tstring.h" +#include "tensorflow/core/protobuf/config.pb.h" #include "tensorflow/core/util/device_name_utils.h" namespace tensorflow { @@ -124,6 +125,13 @@ class ImmediateExecutionContext : public AbstractContext { // Returns the device placement policy for the current thread. virtual ContextDevicePlacementPolicy GetDevicePlacementPolicy() const = 0; + // Configure graph collection in RunMetadata. + virtual void SetShouldStoreGraphs(bool value) = 0; + + // Return the collected RunMetadata. This method will transfer the ownership + // to the caller. + virtual std::unique_ptr ExportRunMetadata() = 0; + // For LLVM style RTTI. static bool classof(const AbstractContext* ptr) { return ptr->getKind() == kEager || ptr->getKind() == kTfrt; @@ -149,9 +157,6 @@ class ImmediateExecutionContext : public AbstractContext { // Update the Eager Executor for current thread. virtual void SetExecutorForThread(EagerExecutor* executor) = 0; - // Configure graph collection in RunMetadata. - virtual void SetShouldStoreGraphs(bool value) = 0; - protected: explicit ImmediateExecutionContext(AbstractContextKind kind) : AbstractContext(kind) {} diff --git a/tensorflow/c/eager/immediate_execution_tensor_handle.h b/tensorflow/c/eager/immediate_execution_tensor_handle.h index 6d32d482747..bb6d471f12f 100644 --- a/tensorflow/c/eager/immediate_execution_tensor_handle.h +++ b/tensorflow/c/eager/immediate_execution_tensor_handle.h @@ -44,6 +44,10 @@ class ImmediateExecutionTensorHandle : public AbstractTensorHandle { virtual const char* DeviceName(Status* status) const = 0; // Returns the device where the tensor was placed. virtual const char* BackingDeviceName(Status* status) const = 0; + // Returns the device type which created the handle. + virtual const char* DeviceType(Status* status) const = 0; + // Returns the device ID which created the handle. + virtual int DeviceId(Status* status) const = 0; // Returns a tensor for the handle. If tensor is remote, it will be copied. virtual AbstractTensorInterface* Resolve(Status* status) = 0; diff --git a/tensorflow/c/eager/mnist_gradients_test.cc b/tensorflow/c/eager/mnist_gradients_test.cc index 4114f50a798..16cb01110fd 100644 --- a/tensorflow/c/eager/mnist_gradients_test.cc +++ b/tensorflow/c/eager/mnist_gradients_test.cc @@ -27,6 +27,7 @@ limitations under the License. #include "tensorflow/c/tf_tensor.h" #include "tensorflow/core/lib/llvm_rtti/llvm_rtti.h" #include "tensorflow/core/platform/errors.h" +#include "tensorflow/core/platform/tensor_float_32_utils.h" #include "tensorflow/core/platform/test.h" namespace tensorflow { @@ -43,6 +44,11 @@ class CppGradients TF_SetTracingImplementation(std::get<0>(GetParam()), status.get()); Status s = StatusFromTF_Status(status.get()); CHECK_EQ(errors::OK, s.code()) << s.error_message(); + + // Computing numerical gradients with TensorFloat-32 is numerically + // unstable. 
Some forward pass tests also fail with TensorFloat-32 due to + // low tolerances + enable_tensor_float_32_execution(false); } }; diff --git a/tensorflow/c/eager/parallel_device/parallel_device_lib.cc b/tensorflow/c/eager/parallel_device/parallel_device_lib.cc index e270bfcbb80..095f33ff303 100644 --- a/tensorflow/c/eager/parallel_device/parallel_device_lib.cc +++ b/tensorflow/c/eager/parallel_device/parallel_device_lib.cc @@ -58,7 +58,7 @@ using ExecutorPtr = std::unique_ptr; class DeviceThread { public: // Starts a background thread waiting for `StartExecute`. - explicit DeviceThread(const std::string& device) + explicit DeviceThread(const std::string& device, const bool is_async) : status_(TF_NewStatus()), device_(device), // If the context's default exector is set to async, re-using that in @@ -67,7 +67,7 @@ class DeviceThread { // // TODO(allenl): We should have an async API that works with the // parallel device. - executor_(TFE_NewExecutor(/*is_async=*/false)), + executor_(TFE_NewExecutor(is_async)), op_(nullptr), thread_(tensorflow::Env::Default()->StartThread( tensorflow::ThreadOptions(), "parallel_device_execute", @@ -236,12 +236,13 @@ void DeviceThread::Execute(TFE_Context* context, const char* operation_name, } } -ParallelDevice::ParallelDevice(const std::vector& devices) +ParallelDevice::ParallelDevice(const std::vector& devices, + const bool is_async) : underlying_devices_(devices) { device_threads_.reserve(devices.size()); for (int device_index = 0; device_index < devices.size(); ++device_index) { device_threads_.emplace_back( - new DeviceThread(devices[device_index].c_str())); + new DeviceThread(devices[device_index].c_str(), is_async)); } } diff --git a/tensorflow/c/eager/parallel_device/parallel_device_lib.h b/tensorflow/c/eager/parallel_device/parallel_device_lib.h index b3dc47ab088..1bb9ce0f663 100644 --- a/tensorflow/c/eager/parallel_device/parallel_device_lib.h +++ b/tensorflow/c/eager/parallel_device/parallel_device_lib.h @@ -49,7 +49,10 @@ class DeviceThread; // placed on each underlying device. class ParallelDevice { public: - explicit ParallelDevice(const std::vector& devices); + // Eager async execution is only supported when remote eager is not in use + // (b/157523095). + explicit ParallelDevice(const std::vector& devices, + const bool is_async = false); ~ParallelDevice(); diff --git a/tensorflow/c/experimental/filesystem/plugins/hadoop/hadoop_filesystem.cc b/tensorflow/c/experimental/filesystem/plugins/hadoop/hadoop_filesystem.cc index 5ff28e4229a..50a9f54cb1e 100644 --- a/tensorflow/c/experimental/filesystem/plugins/hadoop/hadoop_filesystem.cc +++ b/tensorflow/c/experimental/filesystem/plugins/hadoop/hadoop_filesystem.cc @@ -182,9 +182,8 @@ hdfsFS Connect(tf_hadoop_filesystem::HadoopFile* hadoop_file, ParseHadoopPath(path, &scheme, &namenode, &hdfs_path); std::string cacheKey(scheme); - hdfsBuilder* builder = libhdfs->hdfsNewBuilder(); if (scheme == "file") { - libhdfs->hdfsBuilderSetNameNode(builder, nullptr); + namenode = ""; } else if (scheme == "viewfs") { char* defaultFS = nullptr; libhdfs->hdfsConfGetStr("fs.defaultFS", &defaultFS); @@ -200,24 +199,27 @@ hdfsFS Connect(tf_hadoop_filesystem::HadoopFile* hadoop_file, // The default NameNode configuration will be used (from the XML // configuration files). 
See: // https://github.com/tensorflow/tensorflow/blob/v1.0.0/third_party/hadoop/hdfs.h#L259 - libhdfs->hdfsBuilderSetNameNode(builder, "default"); + namenode = "default"; } else if (scheme == "har") { std::string path_har = path; SplitArchiveNameAndPath(&path_har, &namenode, status); if (TF_GetCode(status) != TF_OK) return nullptr; - libhdfs->hdfsBuilderSetNameNode(builder, namenode.c_str()); - cacheKey += namenode; } else { - libhdfs->hdfsBuilderSetNameNode( - builder, namenode.empty() ? "default" : namenode.c_str()); - cacheKey += namenode; + if (namenode.empty()) { + namenode = "default"; + } } + cacheKey += namenode; + absl::MutexLock l(&hadoop_file->connection_cache_lock); if (hadoop_file->connection_cache.find(cacheKey) == hadoop_file->connection_cache.end()) { + hdfsBuilder* builder = libhdfs->hdfsNewBuilder(); + libhdfs->hdfsBuilderSetNameNode( + builder, namenode.empty() ? nullptr : namenode.c_str()); auto cacheFs = libhdfs->hdfsBuilderConnect(builder); if (cacheFs == nullptr) { - TF_SetStatusFromIOError(status, TF_NOT_FOUND, strerror(errno)); + TF_SetStatusFromIOError(status, TF_ABORTED, strerror(errno)); return cacheFs; } hadoop_file->connection_cache[cacheKey] = cacheFs; diff --git a/tensorflow/c/experimental/gradients/math_grad.cc b/tensorflow/c/experimental/gradients/math_grad.cc index 5cba7b28fda..3ee5294ca15 100644 --- a/tensorflow/c/experimental/gradients/math_grad.cc +++ b/tensorflow/c/experimental/gradients/math_grad.cc @@ -24,6 +24,7 @@ using std::vector; using tensorflow::ops::Conj; using tensorflow::ops::MatMul; using tensorflow::ops::Mul; +using tensorflow::ops::Neg; using tensorflow::ops::SqrtGrad; namespace tensorflow { @@ -201,6 +202,93 @@ class MatMulGradientFunction : public GradientFunction { AttrBuilder forward_attrs; }; +class NegGradientFunction : public GradientFunction { + public: + Status Compute(Context* ctx, const IncomingGradients& grad_inputs, + vector* grad_outputs) override { + /* Given upstream grad U and a Neg op Y = -X, the gradients are: + * + * dX = -U + * + */ + + grad_outputs->resize(1); + std::string name = "Neg_Grad"; + TF_RETURN_IF_ERROR(ops::Neg(ctx->ctx, {grad_inputs[0]}, + absl::MakeSpan(*grad_outputs), name.c_str())); + return Status::OK(); + } + ~NegGradientFunction() override {} +}; + +class SubGradientFunction : public GradientFunction { + public: + Status Compute(Context* ctx, const IncomingGradients& grad_inputs, + vector* grad_outputs) override { + /* Given upstream grad U and a Sub op A-B, the gradients are: + * + * dA = U + * dB = -U + * + */ + + grad_outputs->resize(2); + + // Grad for A + DCHECK(grad_inputs[0]); + (*grad_outputs)[0] = grad_inputs[0]; + (*grad_outputs)[0]->Ref(); + + // Grad for B + // negate the upstream grad + std::vector neg_outputs(1); + std::string name = "Neg_Sub_Grad_B"; + TF_RETURN_IF_ERROR(ops::Neg(ctx->ctx, {grad_inputs[0]}, + absl::MakeSpan(neg_outputs), name.c_str())); + (*grad_outputs)[1] = neg_outputs[0]; + + return Status::OK(); + } + ~SubGradientFunction() override {} +}; + +class MulGradientFunction : public GradientFunction { + public: + explicit MulGradientFunction(vector f_inputs) + : forward_inputs(f_inputs) {} + + Status Compute(Context* ctx, const IncomingGradients& grad_inputs, + vector* grad_outputs) override { + /* Given upstream grad U and a mul op A*B, the gradients are: + * + * dA = U * B + * dB = A * U + * + */ + + AbstractTensorHandle* upstream_grad = grad_inputs[0]; + grad_outputs->resize(2); + std::vector mul_outputs(1); + + // Gradient for A + std::string name = "Mul_Grad_A"; 
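// (Editor's note: the single `mul_outputs` buffer is reused for both gradients
// below; each ops::Mul call overwrites slot 0 and the handle is stored into
// grad_outputs before the next call, so no handle is dropped. With upstream
// grad U this computes dA = U * B and dB = A * U, as the comment above states.)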
+ TF_RETURN_IF_ERROR(Mul(ctx->ctx, {upstream_grad, forward_inputs[1]}, + absl::MakeSpan(mul_outputs), name.c_str())); + (*grad_outputs)[0] = mul_outputs[0]; + + // Gradient for B + name = "Mul_Grad_B"; + TF_RETURN_IF_ERROR(Mul(ctx->ctx, {forward_inputs[0], upstream_grad}, + absl::MakeSpan(mul_outputs), name.c_str())); + (*grad_outputs)[1] = mul_outputs[0]; + return Status::OK(); + } + ~MulGradientFunction() override {} + + private: + vector forward_inputs; +}; + } // namespace BackwardFunction* AddRegisterer(const ForwardOperation& op) { @@ -239,5 +327,32 @@ BackwardFunction* SqrtRegisterer(const ForwardOperation& op) { return new BackwardFunction(gradient_function, default_gradients); } +BackwardFunction* NegRegisterer(const ForwardOperation& op) { + auto gradient_function = new NegGradientFunction; + // For ops with a single output, the gradient function is not called if there + // is no incoming gradient. So we do not need to worry about creating zeros + // grads in this case. + auto default_gradients = new PassThroughDefaultGradients(op); + return new BackwardFunction(gradient_function, default_gradients); +} + +BackwardFunction* SubRegisterer(const ForwardOperation& op) { + // For ops with a single output, the gradient function is not called if there + // is no incoming gradient. So we do not need to worry about creating zeros + // grads in this case. + auto gradient_function = new SubGradientFunction; + auto default_gradients = new PassThroughDefaultGradients(op); + return new BackwardFunction(gradient_function, default_gradients); +} + +BackwardFunction* MulRegisterer(const ForwardOperation& op) { + // For ops with a single output, the gradient function is not called if there + // is no incoming gradient. So we do not need to worry about creating zeros + // grads in this case. + auto gradient_function = new MulGradientFunction(op.inputs); + auto default_gradients = new PassThroughDefaultGradients(op); + return new BackwardFunction(gradient_function, default_gradients); +} + } // namespace gradients } // namespace tensorflow diff --git a/tensorflow/c/experimental/gradients/math_grad.h b/tensorflow/c/experimental/gradients/math_grad.h index 7faeadcca81..d2a0bf2b646 100644 --- a/tensorflow/c/experimental/gradients/math_grad.h +++ b/tensorflow/c/experimental/gradients/math_grad.h @@ -24,6 +24,9 @@ BackwardFunction* AddRegisterer(const ForwardOperation& op); BackwardFunction* ExpRegisterer(const ForwardOperation& op); BackwardFunction* MatMulRegisterer(const ForwardOperation& op); BackwardFunction* SqrtRegisterer(const ForwardOperation& op); +BackwardFunction* NegRegisterer(const ForwardOperation& op); +BackwardFunction* SubRegisterer(const ForwardOperation& op); +BackwardFunction* MulRegisterer(const ForwardOperation& op); } // namespace gradients } // namespace tensorflow diff --git a/tensorflow/c/experimental/gradients/tape/tape_operation.cc b/tensorflow/c/experimental/gradients/tape/tape_operation.cc index 0b247d08f6c..841782aa6da 100644 --- a/tensorflow/c/experimental/gradients/tape/tape_operation.cc +++ b/tensorflow/c/experimental/gradients/tape/tape_operation.cc @@ -25,7 +25,7 @@ TapeOperation::TapeOperation(AbstractOperation* parent_op, Tape* tape, parent_op_(parent_op), tape_(tape), registry_(registry) { - // TODO(srbs): Make AbstractOperation RefCounted. + // TODO(b/172003047): Consider making AbstractOperation RefCounted. 
// parent_op_->Ref(); } void TapeOperation::Release() { @@ -33,7 +33,7 @@ void TapeOperation::Release() { delete this; } TapeOperation::~TapeOperation() { - // TODO(srbs): Make AbstractOperation RefCounted. + // TODO(b/172003047): Consider making AbstractOperation RefCounted. // parent_op->Unref(); } Status TapeOperation::Reset(const char* op, const char* raw_device_name) { diff --git a/tensorflow/c/experimental/op_handler/BUILD b/tensorflow/c/experimental/op_handler/BUILD new file mode 100644 index 00000000000..bdb5328180c --- /dev/null +++ b/tensorflow/c/experimental/op_handler/BUILD @@ -0,0 +1,43 @@ +load("//tensorflow/core/platform:rules_cc.bzl", "cc_library") +load("//tensorflow:tensorflow.bzl", "tf_cc_test") + +package( + licenses = ["notice"], # Apache 2.0 +) + +tf_cc_test( + name = "internal_test", + srcs = ["internal_test.cc"], + deps = [ + ":internal", + "//tensorflow/c/eager:c_api_experimental", + "//tensorflow/c/eager:c_api_unified_internal", + "//tensorflow/core:test", + "//tensorflow/core:test_main", + "//tensorflow/core/platform:errors", + "@com_google_absl//absl/types:span", + ], +) + +cc_library( + name = "internal", + srcs = ["internal.cc"], + hdrs = ["internal.h"], + deps = [ + ":wrapper_operation", + "//tensorflow/c:conversion_macros", + "//tensorflow/c/eager:abstract_context", + "//tensorflow/c/eager:abstract_operation", + "//tensorflow/c/eager:abstract_tensor_handle", + "//tensorflow/c/eager:c_api_experimental", + "//tensorflow/core/platform:refcount", + "//tensorflow/core/platform:types", + ], +) + +cc_library( + name = "wrapper_operation", + srcs = ["wrapper_operation.cc"], + hdrs = ["wrapper_operation.h"], + deps = ["//tensorflow/c/eager:abstract_operation"], +) diff --git a/tensorflow/c/experimental/op_handler/internal.cc b/tensorflow/c/experimental/op_handler/internal.cc new file mode 100644 index 00000000000..b9acbf44583 --- /dev/null +++ b/tensorflow/c/experimental/op_handler/internal.cc @@ -0,0 +1,79 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_C_EXPERIMENTAL_OP_HANDLER_INTERNAL_CC_ +#define TENSORFLOW_C_EXPERIMENTAL_OP_HANDLER_INTERNAL_CC_ + +#include "tensorflow/c/experimental/op_handler/internal.h" + +#include "tensorflow/c/conversion_macros.h" +#include "tensorflow/c/eager/abstract_context.h" +#include "tensorflow/c/eager/abstract_operation.h" +#include "tensorflow/c/eager/abstract_tensor_handle.h" +#include "tensorflow/c/experimental/op_handler/wrapper_operation.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { + +OpHandlerContext::OpHandlerContext(AbstractContext* parent_ctx) + : AbstractContext(kOpHandler), parent_ctx_(parent_ctx) {} +OpHandlerContext::~OpHandlerContext() {} +void OpHandlerContext::Release() { delete this; } +Status OpHandlerContext::RegisterFunction(AbstractFunction* function) { + return parent_ctx_->RegisterFunction(function); +} + +Status OpHandlerContext::RemoveFunction(const string& function) { + return parent_ctx_->RemoveFunction(function); +} + +void OpHandlerContext::set_default_handler(OpHandler* handler) { + handler->Ref(); + default_handler_.reset(handler); +} + +OpHandlerOperation* OpHandlerContext::CreateOperation() { + OpHandlerOperation* result = + new OpHandlerOperation(parent_ctx_->CreateOperation()); + if (default_handler_ != nullptr) { + result->set_handler(default_handler_.get()); + } + return result; +} + +OpHandlerOperation::OpHandlerOperation(AbstractOperation* parent_op) + : WrapperOperation(parent_op, kOpHandler) {} + +OpHandler* OpHandlerOperation::get_handler() { return handler_.get(); } + +void OpHandlerOperation::set_handler(OpHandler* handler) { + if (handler != nullptr) { + handler->Ref(); + } + handler_.reset(handler); +} + +Status OpHandlerOperation::Execute(absl::Span<AbstractTensorHandle*> retvals, + int* num_retvals) { + if (handler_ == nullptr) { + return WrapperOperation::Execute(retvals, num_retvals); + } else { + return handler_->Execute(this, retvals, num_retvals); + } +} + +} // namespace tensorflow + +#endif // TENSORFLOW_C_EXPERIMENTAL_OP_HANDLER_INTERNAL_CC_ diff --git a/tensorflow/c/experimental/op_handler/internal.h b/tensorflow/c/experimental/op_handler/internal.h new file mode 100644 index 00000000000..de893f77a7e --- /dev/null +++ b/tensorflow/c/experimental/op_handler/internal.h @@ -0,0 +1,117 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License.
+==============================================================================*/ + +#ifndef TENSORFLOW_C_EXPERIMENTAL_OP_HANDLER_INTERNAL_H_ +#define TENSORFLOW_C_EXPERIMENTAL_OP_HANDLER_INTERNAL_H_ + +#include "tensorflow/c/conversion_macros.h" +#include "tensorflow/c/eager/abstract_context.h" +#include "tensorflow/c/eager/abstract_operation.h" +#include "tensorflow/c/eager/abstract_tensor_handle.h" +#include "tensorflow/c/experimental/op_handler/wrapper_operation.h" +#include "tensorflow/core/platform/refcount.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { + +class OpHandlerOperation; + +// Op handlers are a convenient way to intercept and transform computation. +// +// The implementation is currently experimental and incomplete, but aims +// eventually to support tracing and replay of function bodies, gradients +// through copy operations, and a variety of hooks for things like debug +// strings. A public C API for op handlers is planned. +class OpHandler : public core::RefCounted { + public: + // Called on operation->Execute when operation->get_handler() == this. + // + // Allows the handler to customize or inspect `operation`'s execution. + virtual Status Execute(OpHandlerOperation* operation, + absl::Span retvals, + int* num_retvals) = 0; + // Creates a new handler by merging this handler with `next_handler`. + // + // The new handler is expected to transform operations first with this handler + // and then execute the resulting operations on `next_handler` (by calling + // `OpHandlerOperation::set_handler` and passing `next_handler`). If this is + // not possible then the merge operation should fail. + virtual Status Merge(OpHandler* next_handler, + core::RefCountPtr& merged_handler) = 0; +}; + +// Keeps some handler-specific metadata, but otherwise wraps a single +// AbstractOperation in the underlying context. The operation is created, its +// attributes set, etc., and at execution time it is presented to its handler, +// which may choose to execute it or simply inspect it and do something else. +// +// This is somewhat different than the Context approach, where the operation's +// construction is streamed through each layered Context. The streaming approach +// would require a much larger op handler public API, one function pointer per +// attribute type, and there is some ambiguity before an op is finalized about +// whether it should be presented as-is to handlers (regular operations) or +// replayed (function calls and control flow operations). +class OpHandlerOperation : public WrapperOperation { + public: + explicit OpHandlerOperation(AbstractOperation*); + OpHandler* get_handler(); + void set_handler(OpHandler* handler); + Status Execute(absl::Span retvals, + int* num_retvals) override; + + protected: + core::RefCountPtr handler_; +}; + +// A context which allows a default handler to be set for new operations. It +// otherwise defers to the context it wraps. +// +// TODO(allenl): A stack of contexts and a stack of handlers look pretty similar +// in some ways. Having each handler be its own context seems almost doable, +// with things like copy operations and function/control flow replay being +// somewhat tricky (since they should be generated at the top of the handler +// stack and "caught" at the bottom). After handlers have evolved for a bit we +// should re-evaluate whether the handler+context concepts can be merged. 
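// (Editor's sketch of the intended flow, distilled from internal_test.cc
// below; `MyHandler` is a hypothetical OpHandler subclass and `c_ctx` any
// unwrapped eager AbstractContext*:
//
//   OpHandlerContext ctx(c_ctx);               // wrap an existing context
//   core::RefCountPtr<OpHandler> h(new MyHandler());
//   ctx.set_default_handler(h.get());          // new ops pick this up
//   OpHandlerOperationPtr op(ctx.CreateOperation());
//   op->Reset("NoOp", /*raw_device_name=*/""); // build the op as usual
//   op->Execute(retvals, &num_retvals);        // routed through MyHandler
// )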
+class OpHandlerContext : public AbstractContext { + public: + explicit OpHandlerContext(AbstractContext*); + void Release() override; + OpHandlerOperation* CreateOperation() override; + Status RegisterFunction(AbstractFunction*) override; + Status RemoveFunction(const string&) override; + // For LLVM style RTTI. + static bool classof(const AbstractContext* ptr) { + return ptr->getKind() == kOpHandler; + } + ~OpHandlerContext() override; + + void set_default_handler(OpHandler* handler); + + private: + AbstractContext* parent_ctx_; // Not owned. + core::RefCountPtr<OpHandler> default_handler_; +}; + +class ReleaseOpHandlerOperation { + public: + void operator()(OpHandlerOperation* operation) { operation->Release(); } +}; + +typedef std::unique_ptr<OpHandlerOperation, ReleaseOpHandlerOperation> + OpHandlerOperationPtr; + +} // namespace tensorflow + +#endif // TENSORFLOW_C_EXPERIMENTAL_OP_HANDLER_INTERNAL_H_ diff --git a/tensorflow/c/experimental/op_handler/internal_test.cc b/tensorflow/c/experimental/op_handler/internal_test.cc new file mode 100644 index 00000000000..d8ac8b3b985 --- /dev/null +++ b/tensorflow/c/experimental/op_handler/internal_test.cc @@ -0,0 +1,102 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/c/experimental/op_handler/internal.h" + +#include "absl/types/span.h" +#include "tensorflow/c/eager/c_api_unified_experimental.h" +#include "tensorflow/c/eager/c_api_unified_experimental_internal.h" +#include "tensorflow/core/platform/errors.h" +#include "tensorflow/core/platform/test.h" + +namespace tensorflow { + +class TestOpHandler : public OpHandler { + public: + TestOpHandler() : last_operation_(new std::string("")) {} + Status Execute(OpHandlerOperation* operation, + absl::Span<AbstractTensorHandle*> retvals, + int* num_retvals) override { + CHECK(operation->get_handler() == this); + *last_operation_ = operation->Name(); + operation->set_handler(next_handler_.get()); + return operation->Execute(retvals, num_retvals); + } + Status Merge(OpHandler* next_handler, + core::RefCountPtr<OpHandler>& merged_handler) override { + merged_handler.reset(new TestOpHandler(next_handler, last_operation_)); + return Status::OK(); + } + + core::RefCountPtr<OpHandler> next_handler_ = nullptr; + // Shared between merged handlers of this type.
+ std::shared_ptr<std::string> last_operation_; + + private: + TestOpHandler(OpHandler* next_handler, + std::shared_ptr<std::string> last_operation) + : next_handler_(next_handler), last_operation_(last_operation) { + next_handler->Ref(); + } +}; + +TEST(INTERNAL_TEST, UseOpHandler) { + std::unique_ptr<TF_Status, decltype(&TF_DeleteStatus)> status( + TF_NewStatus(), TF_DeleteStatus); + std::unique_ptr<TFE_ContextOptions, decltype(&TFE_DeleteContextOptions)> opts( + TFE_NewContextOptions(), TFE_DeleteContextOptions); + std::unique_ptr<TF_ExecutionContext, decltype(&TF_DeleteExecutionContext)> + c_ctx(TF_NewEagerExecutionContext(opts.get(), status.get()), + TF_DeleteExecutionContext); + OpHandlerContext ctx(unwrap(c_ctx.get())); + core::RefCountPtr<TestOpHandler> outer_handler(new TestOpHandler()); + core::RefCountPtr<TestOpHandler> inner_handler(new TestOpHandler()); + ctx.set_default_handler(outer_handler.get()); + OpHandlerOperationPtr op(ctx.CreateOperation()); + Status s = op->Reset("NoOp", ""); + ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + std::vector<AbstractTensorHandle*> retvals; + int num_retvals = 0; + EXPECT_EQ("", *outer_handler->last_operation_); + s = op->Execute(absl::Span<AbstractTensorHandle*>(retvals), &num_retvals); + ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + + EXPECT_EQ("NoOp", *outer_handler->last_operation_); + *outer_handler->last_operation_ = ""; + EXPECT_EQ("", *inner_handler->last_operation_); + + // This op executes on both handlers, changing the state of `inner_handler` + // since the handler has decided to preserve that state across merges. + core::RefCountPtr<OpHandler> merged; + s = inner_handler->Merge(outer_handler.get(), merged); + ctx.set_default_handler(merged.get()); + op.reset(ctx.CreateOperation()); + s = op->Reset("NoOp", ""); + ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + s = op->Execute(absl::Span<AbstractTensorHandle*>(retvals), &num_retvals); + ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + EXPECT_EQ("NoOp", *inner_handler->last_operation_); + EXPECT_EQ("NoOp", *outer_handler->last_operation_); + + inner_handler.reset(); + outer_handler.reset(); + op.reset(ctx.CreateOperation()); + s = op->Reset("NoOp", ""); + ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + s = op->Execute(absl::Span<AbstractTensorHandle*>(retvals), &num_retvals); + ASSERT_EQ(errors::OK, s.code()) << s.error_message(); +} + +} // namespace tensorflow diff --git a/tensorflow/c/experimental/op_handler/wrapper_operation.cc b/tensorflow/c/experimental/op_handler/wrapper_operation.cc new file mode 100644 index 00000000000..018bba04b8a --- /dev/null +++ b/tensorflow/c/experimental/op_handler/wrapper_operation.cc @@ -0,0 +1,120 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#include "tensorflow/c/experimental/op_handler/wrapper_operation.h" + +namespace tensorflow { +WrapperOperation::WrapperOperation(AbstractOperation* parent_op, + AbstractOperationKind kind) + : AbstractOperation(kind), parent_op_(parent_op) { + // TODO(b/172003047): Consider making AbstractOperation RefCounted.
+ // parent_op_->Ref(); +} +void WrapperOperation::Release() { + parent_op_->Release(); + // TODO(b/172003047): Consider making AbstractOperation RefCounted. + delete this; +} + +Status WrapperOperation::Reset(const char* op, const char* raw_device_name) { + return parent_op_->Reset(op, raw_device_name); +} +const string& WrapperOperation::Name() const { return parent_op_->Name(); } +const string& WrapperOperation::DeviceName() const { + return parent_op_->DeviceName(); +} +Status WrapperOperation::SetDeviceName(const char* name) { + return parent_op_->SetDeviceName(name); +} +Status WrapperOperation::AddInput(AbstractTensorHandle* input) { + return parent_op_->AddInput(input); +} +Status WrapperOperation::AddInputList( + absl::Span inputs) { + return parent_op_->AddInputList(inputs); +} +Status WrapperOperation::SetAttrString(const char* attr_name, const char* data, + size_t length) { + return parent_op_->SetAttrString(attr_name, data, length); +} +Status WrapperOperation::SetAttrInt(const char* attr_name, int64_t value) { + return parent_op_->SetAttrInt(attr_name, value); +} +Status WrapperOperation::SetAttrFloat(const char* attr_name, float value) { + return parent_op_->SetAttrFloat(attr_name, value); +} +Status WrapperOperation::SetAttrBool(const char* attr_name, bool value) { + return parent_op_->SetAttrBool(attr_name, value); +} +Status WrapperOperation::SetAttrType(const char* attr_name, DataType value) { + return parent_op_->SetAttrType(attr_name, value); +} +Status WrapperOperation::SetAttrShape(const char* attr_name, + const int64_t* dims, const int num_dims) { + return parent_op_->SetAttrShape(attr_name, dims, num_dims); +} +Status WrapperOperation::SetAttrFunction(const char* attr_name, + const AbstractOperation* value) { + return parent_op_->SetAttrFunction(attr_name, value); +} +Status WrapperOperation::SetAttrFunctionName(const char* attr_name, + const char* value, size_t length) { + return parent_op_->SetAttrFunctionName(attr_name, value, length); +} +Status WrapperOperation::SetAttrTensor(const char* attr_name, + AbstractTensorInterface* tensor) { + return parent_op_->SetAttrTensor(attr_name, tensor); +} +Status WrapperOperation::SetAttrStringList(const char* attr_name, + const void* const* values, + const size_t* lengths, + int num_values) { + return parent_op_->SetAttrStringList(attr_name, values, lengths, num_values); +} +Status WrapperOperation::SetAttrFloatList(const char* attr_name, + const float* values, int num_values) { + return parent_op_->SetAttrFloatList(attr_name, values, num_values); +} +Status WrapperOperation::SetAttrIntList(const char* attr_name, + const int64_t* values, int num_values) { + return parent_op_->SetAttrIntList(attr_name, values, num_values); +} +Status WrapperOperation::SetAttrTypeList(const char* attr_name, + const DataType* values, + int num_values) { + return parent_op_->SetAttrTypeList(attr_name, values, num_values); +} +Status WrapperOperation::SetAttrBoolList(const char* attr_name, + const unsigned char* values, + int num_values) { + return parent_op_->SetAttrBoolList(attr_name, values, num_values); +} +Status WrapperOperation::SetAttrShapeList(const char* attr_name, + const int64_t** dims, + const int* num_dims, int num_values) { + return parent_op_->SetAttrShapeList(attr_name, dims, num_dims, num_values); +} +Status WrapperOperation::SetAttrFunctionList( + const char* attr_name, absl::Span values) { + return parent_op_->SetAttrFunctionList(attr_name, values); +} +AbstractOperation* WrapperOperation::GetBackingOperation() { + return 
parent_op_; } +Status WrapperOperation::Execute(absl::Span<AbstractTensorHandle*> retvals, + int* num_retvals) { + return parent_op_->Execute(retvals, num_retvals); +} + +} // namespace tensorflow diff --git a/tensorflow/c/experimental/op_handler/wrapper_operation.h b/tensorflow/c/experimental/op_handler/wrapper_operation.h new file mode 100644 index 00000000000..b0ec9f174f0 --- /dev/null +++ b/tensorflow/c/experimental/op_handler/wrapper_operation.h @@ -0,0 +1,74 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_C_EXPERIMENTAL_OP_HANDLER_WRAPPER_OPERATION_H_ +#define TENSORFLOW_C_EXPERIMENTAL_OP_HANDLER_WRAPPER_OPERATION_H_ + +#include "tensorflow/c/eager/abstract_operation.h" + +namespace tensorflow { + +// Forwards all of the AbstractOperation's methods to its wrapped operation. +// +// Useful as a base class to default to forwarding while adding some +// customization. +class WrapperOperation : public AbstractOperation { + public: + explicit WrapperOperation(AbstractOperation*, AbstractOperationKind kind); + void Release() override; + Status Reset(const char* op, const char* raw_device_name) override; + const string& Name() const override; + const string& DeviceName() const override; + Status SetDeviceName(const char* name) override; + Status AddInput(AbstractTensorHandle* input) override; + Status AddInputList(absl::Span<AbstractTensorHandle* const> inputs) override; + Status Execute(absl::Span<AbstractTensorHandle*> retvals, + int* num_retvals) override; + Status SetAttrString(const char* attr_name, const char* data, + size_t length) override; + Status SetAttrInt(const char* attr_name, int64_t value) override; + Status SetAttrFloat(const char* attr_name, float value) override; + Status SetAttrBool(const char* attr_name, bool value) override; + Status SetAttrType(const char* attr_name, DataType value) override; + Status SetAttrShape(const char* attr_name, const int64_t* dims, + const int num_dims) override; + Status SetAttrFunction(const char* attr_name, + const AbstractOperation* value) override; + Status SetAttrFunctionName(const char* attr_name, const char* value, + size_t length) override; + Status SetAttrTensor(const char* attr_name, + AbstractTensorInterface* tensor) override; + Status SetAttrStringList(const char* attr_name, const void* const* values, + const size_t* lengths, int num_values) override; + Status SetAttrFloatList(const char* attr_name, const float* values, + int num_values) override; + Status SetAttrIntList(const char* attr_name, const int64_t* values, + int num_values) override; + Status SetAttrTypeList(const char* attr_name, const DataType* values, + int num_values) override; + Status SetAttrBoolList(const char* attr_name, const unsigned char* values, + int num_values) override; + Status SetAttrShapeList(const char* attr_name, const int64_t** dims, + const int* num_dims, int num_values) override; + Status SetAttrFunctionList( + const char* attr_name, + absl::Span<const AbstractOperation*> values) override; + 
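// (Editor's aside: a handler-specific subclass only overrides what it needs;
// everything else falls through to the wrapped op. A minimal hypothetical
// example, reusing kOpHandler as the kind the way OpHandlerOperation does:
//
//   class LoggingOperation : public WrapperOperation {
//    public:
//     explicit LoggingOperation(AbstractOperation* op)
//         : WrapperOperation(op, kOpHandler) {}
//     Status Execute(absl::Span<AbstractTensorHandle*> retvals,
//                    int* num_retvals) override {
//       LOG(INFO) << "executing " << Name();
//       return WrapperOperation::Execute(retvals, num_retvals);
//     }
//   };
// )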
AbstractOperation* GetBackingOperation(); + + private: + AbstractOperation* parent_op_; +}; + +} // namespace tensorflow +#endif // TENSORFLOW_C_EXPERIMENTAL_OP_HANDLER_WRAPPER_OPERATION_H_ diff --git a/tensorflow/c/experimental/stream_executor/BUILD b/tensorflow/c/experimental/stream_executor/BUILD index 95bb12e8e50..462993e8918 100644 --- a/tensorflow/c/experimental/stream_executor/BUILD +++ b/tensorflow/c/experimental/stream_executor/BUILD @@ -11,17 +11,29 @@ package( licenses = ["notice"], # Apache 2.0 ) +cc_library( + name = "stream_executor_hdrs", + hdrs = ["stream_executor.h"], + visibility = ["//tensorflow:internal"], + deps = [ + "//tensorflow/c:c_api_macros", + "//tensorflow/c:tf_status_headers", + ], +) + cc_library( name = "stream_executor", srcs = ["stream_executor.cc"], hdrs = ["stream_executor.h"], - visibility = ["//visibility:public"], + visibility = ["//tensorflow:internal"], deps = [ ":stream_executor_internal", "//tensorflow/c:c_api_macros", "//tensorflow/c:tf_status", "//tensorflow/c:tf_status_helper", "//tensorflow/core:lib", + "//tensorflow/core/platform:regexp", + "//tensorflow/core/platform:strcat", "//tensorflow/stream_executor:executor_cache", "//tensorflow/stream_executor:multi_platform_manager", "//tensorflow/stream_executor:platform", diff --git a/tensorflow/c/experimental/stream_executor/stream_executor.cc b/tensorflow/c/experimental/stream_executor/stream_executor.cc index 09442a4f7b7..ec2bada791e 100644 --- a/tensorflow/c/experimental/stream_executor/stream_executor.cc +++ b/tensorflow/c/experimental/stream_executor/stream_executor.cc @@ -28,7 +28,10 @@ limitations under the License. #include "tensorflow/core/platform/env.h" #include "tensorflow/core/platform/errors.h" #include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/regexp.h" #include "tensorflow/core/platform/status.h" +#include "tensorflow/core/platform/strcat.h" +#include "tensorflow/core/platform/stringpiece.h" #include "tensorflow/stream_executor/executor_cache.h" #include "tensorflow/stream_executor/multi_platform_manager.h" #include "tensorflow/stream_executor/platform.h" @@ -40,6 +43,8 @@ limitations under the License. using tensorflow::StatusFromTF_Status; namespace stream_executor { +using tensorflow::StringPiece; + namespace { #define VALIDATE_STRUCT_SIZE(STRUCT_NAME, STRUCT_OBJ, SIZE_VALUE_NAME) \ @@ -59,10 +64,35 @@ namespace { } \ } while (0) +port::Status ValidateDeviceType(StringPiece type) { + // Validate device type. Device type must start with a capital letter and + // consist of capital letters and underscores. Reasoning behind this decision: + // * At the minimum we want to disallow '/' and ':' since + // these characters are used in device spec, for e.g. + // /job:foo/replica:12/device:GPU:1. + // * Underscores seem useful, for e.g. XLA_GPU uses underscores. + // * Allowing lowercase might get confusing. For example, say someone + // registers a new type called "Gpu". It might be confusing for users that + // "Gpu" is not the same device type as "GPU". 
+ // Note that lowercase "cpu" and "gpu" are currently supported only for + // legacy reasons: + // https://cs.opensource.google/tensorflow/tensorflow/+/master:tensorflow/python/framework/device_spec.py;l=46;drc=d3a378f9665d8eee827c74cb9ecbee81e4c288dd + static const LazyRE2 kTfDeviceTypeRegEx = {"[A-Z][A-Z_]*"}; + bool matches = RE2::FullMatch(type, *kTfDeviceTypeRegEx); + if (!matches) { + return port::FailedPreconditionError( + tensorflow::strings::StrCat("Device name/type '", type, "' must match ", + kTfDeviceTypeRegEx->pattern(), ".")); + } + return port::Status::OK(); +} + port::Status ValidateSPPlatform(const SP_Platform& platform) { VALIDATE_STRUCT_SIZE(SP_Platform, platform, SP_PLATFORM_STRUCT_SIZE); VALIDATE_MEMBER(SP_Platform, platform, name); VALIDATE_MEMBER(SP_Platform, platform, type); + TF_RETURN_IF_ERROR(ValidateDeviceType(platform.name)); + TF_RETURN_IF_ERROR(ValidateDeviceType(platform.type)); // `visible_device_count` could be 0 at initialization time. return port::Status::OK(); } diff --git a/tensorflow/c/experimental/stream_executor/stream_executor.h b/tensorflow/c/experimental/stream_executor/stream_executor.h index ba6b1c564a8..bec77ef520b 100644 --- a/tensorflow/c/experimental/stream_executor/stream_executor.h +++ b/tensorflow/c/experimental/stream_executor/stream_executor.h @@ -52,7 +52,7 @@ limitations under the License. // params.device = &device; // // /* Plugin code below */ -// constexpr char DEVICE_NAME[] = "MyDevice"; +// constexpr char DEVICE_NAME[] = "MY_DEVICE"; // constexpr char DEVICE_TYPE[] = "GPU"; // // void create_device(const SP_Platform* platform, @@ -416,10 +416,15 @@ typedef struct SP_Platform { void* ext; // free-form data set by plugin - // Platform name. Must be null-terminated. + // Platform name (also referred to as subtype), for example MY_DEVICE. + // The name must start with a capital letter and consist of + // capital letters and underscores. + // Must be null-terminated. const char* name; // Device type name, for example GPU. Must be null-terminated. + // The name must start with a capital letter and consist of + // capital letters and underscores. 
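// (Editor's example: the test below registers name = "MY_DEVICE" with
// type = "GPU"; both satisfy the [A-Z][A-Z_]* pattern that ValidateDeviceType
// enforces, while strings such as "MyDevice" or "my_device" would be rejected
// at plugin initialization.)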
const char* type; // Number of visible devices diff --git a/tensorflow/c/experimental/stream_executor/stream_executor_test.cc b/tensorflow/c/experimental/stream_executor/stream_executor_test.cc index b28d1f6fc6d..56c4ea09052 100644 --- a/tensorflow/c/experimental/stream_executor/stream_executor_test.cc +++ b/tensorflow/c/experimental/stream_executor/stream_executor_test.cc @@ -41,9 +41,9 @@ struct SP_Timer_st { namespace stream_executor { namespace { -constexpr int DEVICE_COUNT = 2; -constexpr char DEVICE_NAME[] = "MyDevice"; -constexpr char DEVICE_TYPE[] = "GPU"; +constexpr int kDeviceCount = 2; +constexpr char kDeviceName[] = "MY_DEVICE"; +constexpr char kDeviceType[] = "GPU"; /*** Create SP_StreamExecutor (with empty functions) ***/ void allocate(const SP_Device* const device, uint64_t size, @@ -190,9 +190,9 @@ void destroy_device_fns(const SP_Platform* platform, SP_DeviceFns* device_fns) { void PopulateDefaultPlatform(SP_Platform* platform, SP_PlatformFns* platform_fns) { *platform = {SP_PLATFORM_STRUCT_SIZE}; - platform->name = DEVICE_NAME; - platform->type = DEVICE_TYPE; - platform->visible_device_count = DEVICE_COUNT; + platform->name = kDeviceName; + platform->type = kDeviceType; + platform->visible_device_count = kDeviceCount; platform_fns->create_device = create_device; platform_fns->destroy_device = destroy_device; platform_fns->create_device_fns = create_device_fns; @@ -218,11 +218,11 @@ TEST(StreamExecutor, SuccessfulRegistration) { port::Status status = InitStreamExecutorPlugin(plugin_init); TF_ASSERT_OK(status); port::StatusOr maybe_platform = - MultiPlatformManager::PlatformWithName("MyDevice"); + MultiPlatformManager::PlatformWithName("MY_DEVICE"); TF_ASSERT_OK(maybe_platform.status()); Platform* platform = maybe_platform.ConsumeValueOrDie(); - ASSERT_EQ(platform->Name(), DEVICE_NAME); - ASSERT_EQ(platform->VisibleDeviceCount(), DEVICE_COUNT); + ASSERT_EQ(platform->Name(), kDeviceName); + ASSERT_EQ(platform->VisibleDeviceCount(), kDeviceCount); port::StatusOr maybe_executor = platform->ExecutorForDevice(0); @@ -244,6 +244,39 @@ TEST(StreamExecutor, NameNotSet) { ASSERT_EQ(status.error_message(), "'name' field in SP_Platform must be set."); } +TEST(StreamExecutor, InvalidNameWithSemicolon) { + auto plugin_init = [](SE_PlatformRegistrationParams* const params, + TF_Status* const status) -> void { + TF_SetStatus(status, TF_OK, ""); + PopulateDefaultPlatform(params->platform, params->platform_fns); + params->platform->name = "INVALID:NAME"; + params->destroy_platform = destroy_platform; + params->destroy_platform_fns = destroy_platform_fns; + }; + + port::Status status = InitStreamExecutorPlugin(plugin_init); + ASSERT_EQ(status.code(), tensorflow::error::FAILED_PRECONDITION); + EXPECT_THAT( + status.error_message(), + testing::ContainsRegex("Device name/type 'INVALID:NAME' must match")); +} + +TEST(StreamExecutor, InvalidNameWithSlash) { + auto plugin_init = [](SE_PlatformRegistrationParams* const params, + TF_Status* const status) -> void { + TF_SetStatus(status, TF_OK, ""); + PopulateDefaultPlatform(params->platform, params->platform_fns); + params->platform->name = "INVALID/"; + params->destroy_platform = destroy_platform; + params->destroy_platform_fns = destroy_platform_fns; + }; + + port::Status status = InitStreamExecutorPlugin(plugin_init); + ASSERT_EQ(status.code(), tensorflow::error::FAILED_PRECONDITION); + EXPECT_THAT(status.error_message(), + testing::ContainsRegex("Device name/type 'INVALID/' must match")); +} + TEST(StreamExecutor, CreateDeviceNotSet) { auto 
plugin_init = [](SE_PlatformRegistrationParams* const params, TF_Status* const status) -> void { diff --git a/tensorflow/c/kernels/bitcast_op.cc b/tensorflow/c/kernels/bitcast_op.cc index c194dcd686b..c6468e0ab80 100644 --- a/tensorflow/c/kernels/bitcast_op.cc +++ b/tensorflow/c/kernels/bitcast_op.cc @@ -148,7 +148,7 @@ void RegisterBitcastOpKernel() { << "Error while registering bitcast kernel"; } -#if GOOGLE_CUDA +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM { auto* builder = TF_NewKernelBuilder("Bitcast", tensorflow::DEVICE_GPU, &BitcastOp_Create, &BitcastOp_Compute, diff --git a/tensorflow/c/kernels_test.cc b/tensorflow/c/kernels_test.cc index c9df2cc34d1..5ddc9a46be1 100644 --- a/tensorflow/c/kernels_test.cc +++ b/tensorflow/c/kernels_test.cc @@ -352,7 +352,7 @@ class DeviceKernelOpTest : public OpsTestBase { EXPECT_EQ(TF_OK, TF_GetCode(status)); TF_DeleteStatus(status); -#if GOOGLE_CUDA +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM std::unique_ptr<Device> device( DeviceFactory::NewDevice(device_name_, {}, "/job:a/replica:0/task:0")); OpsTestBase::SetDevice(DEVICE_GPU, std::move(device)); @@ -361,7 +361,7 @@ class DeviceKernelOpTest : public OpsTestBase { TF_ASSERT_OK(InitOp()); } -#if GOOGLE_CUDA +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM const char* device_name_ = tensorflow::DEVICE_GPU; #else const char* device_name_ = tensorflow::DEVICE_CPU; @@ -468,7 +468,7 @@ TEST_F(DeviceKernelOpTest, TestAllocateTempSizeOne) { int64_t dim = 1; TF_AllocatorAttributes alloc_attrs; alloc_attrs.struct_size = TF_ALLOCATOR_ATTRIBUTES_STRUCT_SIZE; -#if GOOGLE_CUDA +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM alloc_attrs.on_host = 0; #else alloc_attrs.on_host = 1; @@ -505,7 +505,7 @@ TEST_F(DeviceKernelOpTest, TestAllocateTempEmpty) { int64_t dim = 0; TF_AllocatorAttributes alloc_attrs; alloc_attrs.struct_size = TF_ALLOCATOR_ATTRIBUTES_STRUCT_SIZE; -#if GOOGLE_CUDA +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM alloc_attrs.on_host = 0; #else alloc_attrs.on_host = 1; @@ -538,7 +538,7 @@ TEST_F(DeviceKernelOpTest, TestAllocateTempSize2x3) { int64_t dim[2] = {2, 3}; TF_AllocatorAttributes alloc_attrs; alloc_attrs.struct_size = TF_ALLOCATOR_ATTRIBUTES_STRUCT_SIZE; -#if GOOGLE_CUDA +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM alloc_attrs.on_host = 0; #else alloc_attrs.on_host = 1; @@ -646,7 +646,7 @@ template <typename T> void set_tensor_data(TF_Tensor* tensor, T* values, size_t tensor_size_bytes, TF_OpKernelContext* ctx) { T* data = reinterpret_cast<T*>(TF_TensorData(tensor)); -#if GOOGLE_CUDA +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM OpKernelContext* cc_ctx = reinterpret_cast<OpKernelContext*>(ctx); cc_ctx->eigen_gpu_device().memcpyHostToDevice(data, values, tensor_size_bytes); diff --git a/tensorflow/cc/BUILD b/tensorflow/cc/BUILD index bf5ada89cdd..8f7e447d322 100644 --- a/tensorflow/cc/BUILD +++ b/tensorflow/cc/BUILD @@ -251,7 +251,6 @@ cc_library_with_android_deps( deps = [ "//tensorflow/core:core_cpu", "//tensorflow/core:lib", - "//tensorflow/core:lib_experimental", "//tensorflow/core:protos_all_cc", ], ) @@ -266,7 +265,6 @@ tf_cc_test( "//tensorflow/core:core_cpu_internal", "//tensorflow/core:framework", "//tensorflow/core:lib", - "//tensorflow/core:lib_experimental", "//tensorflow/core:tensorflow", "//tensorflow/core:test", "//tensorflow/core:test_main", diff --git a/tensorflow/cc/gradients/array_grad.cc b/tensorflow/cc/gradients/array_grad.cc index e9173227aad..480243a29e6 100644 --- a/tensorflow/cc/gradients/array_grad.cc +++ b/tensorflow/cc/gradients/array_grad.cc @@ -15,13 +15,12 @@ limitations under the License.
#include <vector> +#include "tensorflow/cc/framework/grad_op_registry.h" +#include "tensorflow/cc/framework/gradients.h" #include "tensorflow/cc/ops/array_ops_internal.h" #include "tensorflow/cc/ops/standard_ops.h" #include "tensorflow/core/lib/strings/strcat.h" -#include "tensorflow/cc/framework/grad_op_registry.h" -#include "tensorflow/cc/framework/gradients.h" - namespace tensorflow { namespace ops { namespace { @@ -90,15 +89,25 @@ Status QuantizeAndDequantizeGrad(const Scope& scope, const Operation& op, } REGISTER_GRADIENT_OP("QuantizeAndDequantize", QuantizeAndDequantizeGrad); -Status QuantizeAndDequantizeV2Grad(const Scope& scope, const Operation& op, - const std::vector<Output>& grad_inputs, - std::vector<Output>* grad_outputs) { - grad_outputs->push_back(Identity(scope, grad_inputs[0])); - grad_outputs->push_back(NoGradient()); - grad_outputs->push_back(NoGradient()); +Status QuantizeAndDequantizeV4GradHelper(const Scope& scope, + const Operation& op, + const std::vector<Output>& grad_inputs, + std::vector<Output>* grad_outputs) { + Input input = Shape(scope, op.input(0)); + Input input_min = op.input(1); + Input input_max = op.input(2); + int64 axis; + TF_RETURN_IF_ERROR(GetNodeAttr(op.node()->attrs(), "axis", &axis)); + auto qdq_v4_grad = QuantizeAndDequantizeV4Grad( + scope, grad_inputs[0], input, input_min, input_max, + QuantizeAndDequantizeV4Grad::Axis(axis)); + grad_outputs->push_back(qdq_v4_grad.input_backprop); + grad_outputs->push_back(qdq_v4_grad.input_min_backprop); + grad_outputs->push_back(qdq_v4_grad.input_max_backprop); return scope.status(); } -REGISTER_GRADIENT_OP("QuantizeAndDequantizeV2", QuantizeAndDequantizeV2Grad); +REGISTER_GRADIENT_OP("QuantizeAndDequantizeV4", + QuantizeAndDequantizeV4GradHelper); Status QuantizeAndDequantizeV3Grad(const Scope& scope, const Operation& op, const std::vector<Output>& grad_inputs, diff --git a/tensorflow/cc/saved_model/loader.cc b/tensorflow/cc/saved_model/loader.cc index 70d080a682f..dcd652d9fdf 100644 --- a/tensorflow/cc/saved_model/loader.cc +++ b/tensorflow/cc/saved_model/loader.cc @@ -404,10 +404,12 @@ Status RestoreSession(const RunOptions& run_options, const uint64 read_start_microseconds = Env::Default()->NowMicros(); std::vector<AssetFileDef> asset_file_defs; TF_RETURN_IF_ERROR(internal::GetAssetFileDefs(meta_graph, &asset_file_defs)); - TF_RETURN_IF_ERROR(RunRestore(run_options, export_dir, - meta_graph.saver_def().restore_op_name(), - meta_graph.saver_def().filename_tensor_name(), - asset_file_defs, session->get())); + if (meta_graph.has_saver_def()) { + TF_RETURN_IF_ERROR(RunRestore(run_options, export_dir, + meta_graph.saver_def().restore_op_name(), + meta_graph.saver_def().filename_tensor_name(), + asset_file_defs, session->get())); + } // Record walltime spent in restoring graph from disk, but postpone metric // increments until graph init finishes.
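// Sketch of the caller-visible effect of the has_saver_def() guard above
// (hypothetical usage; assumes a v2 SavedModel exported without a SaverDef):
//   SavedModelBundle bundle;
//   TF_RETURN_IF_ERROR(LoadSavedModel(session_options, run_options,
//                                     export_dir, {"serve"}, &bundle));
// Previously the loader called RunRestore() unconditionally, with empty
// restore_op_name/filename_tensor_name, and failed for such models; now the
// restore step is simply skipped when no SaverDef is present.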
const uint64 restore_graph_walltime = diff --git a/tensorflow/compiler/jit/BUILD b/tensorflow/compiler/jit/BUILD index 7d87b5f0715..5c84eecd976 100644 --- a/tensorflow/compiler/jit/BUILD +++ b/tensorflow/compiler/jit/BUILD @@ -7,6 +7,9 @@ load("//tensorflow:tensorflow.bzl", "cc_header_only_library", "if_mlir", "tf_cc_ load("//tensorflow:tensorflow.bzl", "if_libtpu", "tf_copts") load("//tensorflow/stream_executor:build_defs.bzl", "if_cuda_or_rocm") +# buildifier: disable=same-origin-load +load("//tensorflow:tensorflow.bzl", "cc_header_only_library") + # buildifier: disable=same-origin-load load("//tensorflow:tensorflow.bzl", "filegroup") @@ -283,6 +286,7 @@ cc_library( "//tensorflow/compiler/xla:parse_flags_from_env", "//tensorflow/core:framework_internal", "//tensorflow/core:lib", + "//tensorflow/core:protos_all_cc", "@com_google_absl//absl/base", "@com_google_absl//absl/strings", ], @@ -291,7 +295,7 @@ cc_library( # Header-only version of "flags" library, for linking from the shared object # without ODR violations. cc_library( - name = "flags_headers_only", + name = "flags_headers", hdrs = ["flags.h"], visibility = [":friends"], deps = [ @@ -302,6 +306,11 @@ cc_library( ], ) +cc_header_only_library( + name = "flags_headers_only", + deps = [":flags_headers"], +) + cc_library( name = "common", srcs = [ @@ -447,8 +456,8 @@ cc_library( # Header-only version of "flags" library, for linking from the shared object # without ODR violations. cc_library( - name = "get_compiler_ir_hdrs_only", - hdrs = ["get_compiler_ir.h"], + name = "get_compiler_ir_hdrs", + textual_hdrs = ["get_compiler_ir.h"], visibility = [ ":internal", "//learning/brain/contrib/tpu_modeling/exp/tpu_inference_converter:__pkg__", @@ -463,6 +472,23 @@ cc_library( ], ) +cc_header_only_library( + name = "get_compiler_ir_hdrs_only", + deps = [":get_compiler_ir_hdrs"], +) + +# This target can be used by XLA device plugins to prevent circular dependencies, and provides access to all of the required headers for building a device library. 
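# For illustration only: a hypothetical out-of-tree device plugin could pull
# in all of these headers with a single dependency (target and file names
# invented, not part of this patch):
#
#   cc_library(
#       name = "my_xla_device",
#       srcs = ["my_xla_device.cc"],
#       deps = ["//tensorflow/compiler/jit:xla_jit_headers_lib"],
#   )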
+cc_header_only_library( + name = "xla_jit_headers_lib", + visibility = ["//visibility:public"], + deps = [ + ":xla_cpu_device", + ":xla_cpu_jit", + ":xla_gpu_device", + ":xla_gpu_jit", + ], +) + cc_library( name = "xla_kernel_creator", srcs = [ @@ -520,8 +546,8 @@ cc_library( hdrs = ["resource_operation_safety_analysis.h"], deps = [ ":xla_cluster_util", - "//tensorflow/compiler/jit/graphcycles", "//tensorflow/compiler/tf2xla:resource_operation_table", + "//tensorflow/compiler/xla/service/graphcycles", "//tensorflow/core:framework", "//tensorflow/core:graph", "//tensorflow/core:lib", @@ -692,7 +718,6 @@ cc_library( "//tensorflow/cc:ops", "//tensorflow/cc:scope", "//tensorflow/cc:scope_internal", - "//tensorflow/compiler/jit/graphcycles", "//tensorflow/compiler/jit/ops:xla_ops", "//tensorflow/compiler/tf2xla:resource_operation_table", "//tensorflow/compiler/tf2xla:side_effect_util", @@ -705,6 +730,7 @@ cc_library( "//tensorflow/compiler/xla:statusor", "//tensorflow/compiler/xla:union_find", "//tensorflow/compiler/xla:util", + "//tensorflow/compiler/xla/service/graphcycles", "//tensorflow/core:core_cpu", "//tensorflow/core:core_cpu_internal", "//tensorflow/core:framework", @@ -732,9 +758,9 @@ cc_library( deps = [ ":flags", ":xla_activity_proto_cc", - "//tensorflow/compiler/jit/graphcycles", "//tensorflow/compiler/xla:status_macros", "//tensorflow/compiler/xla:statusor", + "//tensorflow/compiler/xla/service/graphcycles", "//tensorflow/core:core_cpu", "//tensorflow/core:framework", "//tensorflow/core:framework_internal", @@ -842,9 +868,12 @@ tf_cc_test( "partially_decluster_pass_test.cc", "rearrange_function_argument_pass_test.cc", ], - # TODO(b/141643254) Re-enable msan after fixing use-of-uninitialized-value - # error. - tags = ["nomsan"] + tf_cuda_tests_tags(), + tags = [ + # TODO(b/141643254) Re-enable msan after fixing + # use-of-uninitialized-value error. + "nomsan", + "no_cuda_asan", # TODO(b/171317460): re-enable. + ] + tf_cuda_tests_tags(), deps = [ ":common", ":compilability_check_util", @@ -965,13 +994,13 @@ cc_library( ":xla_activity_listener", ":xla_activity_proto_cc", ":xla_cluster_util", - "//tensorflow/compiler/jit/graphcycles", "//tensorflow/compiler/tf2xla:resource_operation_table", "//tensorflow/compiler/tf2xla:xla_compiler", "//tensorflow/compiler/tf2xla:xla_op_registry", "//tensorflow/compiler/xla:statusor", "//tensorflow/compiler/xla:union_find", "//tensorflow/compiler/xla:util", + "//tensorflow/compiler/xla/service/graphcycles", "//tensorflow/core:core_cpu", "//tensorflow/core:framework", "//tensorflow/core:graph", @@ -1075,15 +1104,3 @@ cc_library( ], alwayslink = 1, ) - -# This target can be used by XLA device plugins to prevent circular dependencies, and provides access to all of the required headers for building a device library. -cc_header_only_library( - name = "xla_jit_headers_lib", - visibility = ["//visibility:public"], - deps = [ - ":xla_cpu_device", - ":xla_cpu_jit", - ":xla_gpu_device", - ":xla_gpu_jit", - ], -) diff --git a/tensorflow/compiler/jit/compilability_check_util.cc b/tensorflow/compiler/jit/compilability_check_util.cc index 62e121420c3..87b06c2ab36 100644 --- a/tensorflow/compiler/jit/compilability_check_util.cc +++ b/tensorflow/compiler/jit/compilability_check_util.cc @@ -34,7 +34,6 @@ limitations under the License. 
#include "tensorflow/compiler/jit/defs.h" #include "tensorflow/compiler/jit/device_util.h" #include "tensorflow/compiler/jit/flags.h" -#include "tensorflow/compiler/jit/graphcycles/graphcycles.h" #include "tensorflow/compiler/jit/resource_operation_safety_analysis.h" #include "tensorflow/compiler/jit/xla_activity.pb.h" #include "tensorflow/compiler/jit/xla_activity_listener.h" @@ -42,6 +41,7 @@ limitations under the License. #include "tensorflow/compiler/tf2xla/const_analysis.h" #include "tensorflow/compiler/tf2xla/resource_operation_table.h" #include "tensorflow/compiler/tf2xla/xla_op_registry.h" +#include "tensorflow/compiler/xla/service/graphcycles/graphcycles.h" #include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/compiler/xla/union_find.h" #include "tensorflow/compiler/xla/util.h" diff --git a/tensorflow/compiler/jit/compilability_check_util.h b/tensorflow/compiler/jit/compilability_check_util.h index 65da072483b..224bedabd3b 100644 --- a/tensorflow/compiler/jit/compilability_check_util.h +++ b/tensorflow/compiler/jit/compilability_check_util.h @@ -24,11 +24,11 @@ limitations under the License. #include "tensorflow/compiler/jit/defs.h" #include "tensorflow/compiler/jit/device_util.h" #include "tensorflow/compiler/jit/flags.h" -#include "tensorflow/compiler/jit/graphcycles/graphcycles.h" #include "tensorflow/compiler/jit/resource_operation_safety_analysis.h" #include "tensorflow/compiler/tf2xla/const_analysis.h" #include "tensorflow/compiler/tf2xla/resource_operation_table.h" #include "tensorflow/compiler/tf2xla/xla_op_registry.h" +#include "tensorflow/compiler/xla/service/graphcycles/graphcycles.h" #include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/compiler/xla/union_find.h" #include "tensorflow/compiler/xla/util.h" diff --git a/tensorflow/compiler/jit/encapsulate_subgraphs_pass.cc b/tensorflow/compiler/jit/encapsulate_subgraphs_pass.cc index d482642b44c..fd55cab637c 100644 --- a/tensorflow/compiler/jit/encapsulate_subgraphs_pass.cc +++ b/tensorflow/compiler/jit/encapsulate_subgraphs_pass.cc @@ -27,11 +27,11 @@ limitations under the License. #include "absl/strings/str_cat.h" #include "absl/types/optional.h" #include "tensorflow/compiler/jit/flags.h" -#include "tensorflow/compiler/jit/graphcycles/graphcycles.h" #include "tensorflow/compiler/jit/mark_for_compilation_pass.h" #include "tensorflow/compiler/jit/shape_inference_helpers.h" #include "tensorflow/compiler/jit/xla_cluster_util.h" #include "tensorflow/compiler/tf2xla/const_analysis.h" +#include "tensorflow/compiler/xla/service/graphcycles/graphcycles.h" #include "tensorflow/compiler/xla/status_macros.h" #include "tensorflow/core/common_runtime/device_factory.h" #include "tensorflow/core/common_runtime/function.h" diff --git a/tensorflow/compiler/jit/flags.cc b/tensorflow/compiler/jit/flags.cc index ee7daf092da..52d8fb94ff6 100644 --- a/tensorflow/compiler/jit/flags.cc +++ b/tensorflow/compiler/jit/flags.cc @@ -167,8 +167,16 @@ void AllocateAndParseFlags() { jitter_flags = new IntroduceFloatingPointJitterPassFlags; jitter_flags->jitter_amount = 1e-5; - mlir_flags = new MlirCommonFlags; - mlir_flags->tf_mlir_enable_mlir_bridge = false; + // The `enable_mlir_bridge` flag allows the user to explicitly request that + // their program is (or isn't) compiled using the MLIR-based TF-to-XLA bridge. + // + // The `enable_mlir_bridge_is_explicit` variable tracks whether or not the + // user has made an explicit request. 
That is, if this variable is set to + // true, the program honors the user's request as per `enable_mlir_bridge`; if + // it's set to false, the default behavior is used (which may run either + // bridge, on a per-graph basis). + bool enable_mlir_bridge = false; + bool enable_mlir_bridge_is_explicit = false; auto setter_for_jitter_tensor_names = [](string sequence) { jitter_flags->tensor_names = absl::StrSplit(sequence, ','); @@ -217,12 +225,24 @@ void AllocateAndParseFlags() { "The amount of jitter to introduce. This amount is added to each " "element in the tensors named in `tensor_names."), - Flag("tf_mlir_enable_mlir_bridge", - &mlir_flags->tf_mlir_enable_mlir_bridge, - "Enables experimental MLIR-Based TensorFlow Compiler Bridge.")}); + Flag("tf_mlir_enable_mlir_bridge", &enable_mlir_bridge, + "Enables experimental MLIR-Based TensorFlow Compiler Bridge.", + &enable_mlir_bridge_is_explicit)}); AppendMarkForCompilationPassFlagsInternal(flag_list); xla::ParseFlagsFromEnvAndDieIfUnknown("TF_XLA_FLAGS", *flag_list); + + mlir_flags = new MlirCommonFlags; + if (!enable_mlir_bridge_is_explicit) { + mlir_flags->tf_mlir_enable_mlir_bridge = + ConfigProto::Experimental::MLIR_BRIDGE_ROLLOUT_UNSPECIFIED; + } else if (enable_mlir_bridge) { + mlir_flags->tf_mlir_enable_mlir_bridge = + ConfigProto::Experimental::MLIR_BRIDGE_ROLLOUT_ENABLED; + } else { + mlir_flags->tf_mlir_enable_mlir_bridge = + ConfigProto::Experimental::MLIR_BRIDGE_ROLLOUT_DISABLED; + } } } // namespace diff --git a/tensorflow/compiler/jit/flags.h b/tensorflow/compiler/jit/flags.h index 5612b3b5864..a0860da7b04 100644 --- a/tensorflow/compiler/jit/flags.h +++ b/tensorflow/compiler/jit/flags.h @@ -19,6 +19,7 @@ limitations under the License. #include #include "tensorflow/core/platform/types.h" +#include "tensorflow/core/protobuf/config.pb.h" #include "tensorflow/core/util/command_line_flags.h" namespace tensorflow { @@ -135,7 +136,7 @@ struct IntroduceFloatingPointJitterPassFlags { // Flags for common MLIR configurations. struct MlirCommonFlags { - bool tf_mlir_enable_mlir_bridge; + ConfigProto::Experimental::MlirBridgeRollout tf_mlir_enable_mlir_bridge; }; // Return a pointer to the DumpGraphFlags struct; diff --git a/tensorflow/compiler/jit/kernels/xla_ops.cc b/tensorflow/compiler/jit/kernels/xla_ops.cc index 12b40b1c83b..0f0f43cbad6 100644 --- a/tensorflow/compiler/jit/kernels/xla_ops.cc +++ b/tensorflow/compiler/jit/kernels/xla_ops.cc @@ -274,18 +274,6 @@ void XlaLocalLaunchBase::Compute(OpKernelContext* ctx) { run_options.set_allocator(allocator); run_options.set_intra_op_thread_pool(&ctx->eigen_cpu_device()); run_options.set_rng_seed(GetXLARandomSeed()); - xla::ThenExecuteFunction then_execute; - if (ctx->op_device_context()) { - then_execute = [&](se::Stream* stream, std::function fn) { - Status status = ctx->op_device_context()->ThenExecute( - down_cast(ctx->device()), stream, std::move(fn)); - if (!status.ok()) { - // This should never happen. 
- LOG(ERROR) << "ThenExecute failed " << status; - } - }; - run_options.set_then_execute_function(&then_execute); - } Env* env = Env::Default(); auto start_time = env->NowMicros(); @@ -522,18 +510,6 @@ void XlaRunOp::Compute(OpKernelContext* ctx) { run_options.set_allocator(allocator); run_options.set_intra_op_thread_pool(&ctx->eigen_cpu_device()); run_options.set_rng_seed(GetXLARandomSeed()); - xla::ThenExecuteFunction then_execute; - if (ctx->op_device_context()) { - then_execute = [&](se::Stream* stream, std::function fn) { - Status status = ctx->op_device_context()->ThenExecute( - down_cast(ctx->device()), stream, std::move(fn)); - if (!status.ok()) { - // This should never happen. - LOG(ERROR) << "ThenExecute failed " << status; - } - }; - run_options.set_then_execute_function(&then_execute); - } Env* env = Env::Default(); auto start_time = env->NowMicros(); diff --git a/tensorflow/compiler/jit/mark_for_compilation_pass.cc b/tensorflow/compiler/jit/mark_for_compilation_pass.cc index 317e29d4a84..abd5d8d02f6 100644 --- a/tensorflow/compiler/jit/mark_for_compilation_pass.cc +++ b/tensorflow/compiler/jit/mark_for_compilation_pass.cc @@ -30,12 +30,12 @@ limitations under the License. #include "tensorflow/compiler/jit/defs.h" #include "tensorflow/compiler/jit/device_util.h" #include "tensorflow/compiler/jit/flags.h" -#include "tensorflow/compiler/jit/graphcycles/graphcycles.h" #include "tensorflow/compiler/jit/resource_operation_safety_analysis.h" #include "tensorflow/compiler/jit/xla_cluster_util.h" #include "tensorflow/compiler/tf2xla/const_analysis.h" #include "tensorflow/compiler/tf2xla/resource_operation_table.h" #include "tensorflow/compiler/tf2xla/xla_op_registry.h" +#include "tensorflow/compiler/xla/service/graphcycles/graphcycles.h" #include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/compiler/xla/union_find.h" #include "tensorflow/compiler/xla/util.h" @@ -1801,11 +1801,11 @@ absl::flat_hash_map>* GetAllowlistTable() { "Range", "Rank", "Reshape", "Shape", "ShapeN", "Size", "Squeeze", "Transpose", "ZerosLike", "OnesLike", "BiasAdd" /*PW + Broadcast*/, "BroadcastArgs", "BroadcastGradientArgs", "OneHot", "Concat", "ConcatV2", - "ConcatOffset", "Const", "MirrorPad", "Pack", "Pad", "PadV2", "Reverse", - "ReverseV2", "ReverseSequence", "Slice", "Split", "SplitV", - "StridedSlice", "StridedSliceGrad", "ResourceStridedSliceAssign", - "Tile", "Transpose", "InvertPermutation", "Unpack", "DeviceIndex", - "TensorStridedSliceUpdate", + "ConcatOffset", "Const", "MirrorPad", "MirrorPadGrad", "Pack", "Pad", + "PadV2", "Reverse", "ReverseV2", "ReverseSequence", "Slice", "Split", + "SplitV", "StridedSlice", "StridedSliceGrad", + "ResourceStridedSliceAssign", "Tile", "Transpose", "InvertPermutation", + "Unpack", "DeviceIndex", "TensorStridedSliceUpdate", }}}; // clang-format on return result; @@ -2061,11 +2061,13 @@ absl::flat_hash_set GetKnownXLAAllowlistOp() { "XlaSelfAdjointEig", "XlaSend", "XlaSetBound", + "XlaSetDynamicDimensionSize", "XlaSharding", "XlaSort", "XlaSpmdFullToShardShape", "XlaSpmdShardToFullShape", "XlaSvd", + "XlaVariadicReduce", "XlaWhile", "Zeta", "_Arg", diff --git a/tensorflow/compiler/jit/resource_operation_safety_analysis.h b/tensorflow/compiler/jit/resource_operation_safety_analysis.h index c652e5fe216..3931ae6c7cc 100644 --- a/tensorflow/compiler/jit/resource_operation_safety_analysis.h +++ b/tensorflow/compiler/jit/resource_operation_safety_analysis.h @@ -16,7 +16,7 @@ limitations under the License. 
#ifndef TENSORFLOW_COMPILER_JIT_RESOURCE_OPERATION_SAFETY_ANALYSIS_H_ #define TENSORFLOW_COMPILER_JIT_RESOURCE_OPERATION_SAFETY_ANALYSIS_H_ -#include "tensorflow/compiler/jit/graphcycles/graphcycles.h" +#include "tensorflow/compiler/xla/service/graphcycles/graphcycles.h" #include "tensorflow/core/framework/function.h" #include "tensorflow/core/graph/graph.h" diff --git a/tensorflow/compiler/jit/xla_cluster_util.h b/tensorflow/compiler/jit/xla_cluster_util.h index e2a1d159336..bf6dd5ab9f4 100644 --- a/tensorflow/compiler/jit/xla_cluster_util.h +++ b/tensorflow/compiler/jit/xla_cluster_util.h @@ -21,8 +21,8 @@ limitations under the License. #include "absl/container/flat_hash_map.h" #include "absl/container/flat_hash_set.h" #include "absl/types/optional.h" -#include "tensorflow/compiler/jit/graphcycles/graphcycles.h" #include "tensorflow/compiler/jit/xla_activity.pb.h" +#include "tensorflow/compiler/xla/service/graphcycles/graphcycles.h" #include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/core/common_runtime/optimization_registry.h" #include "tensorflow/core/graph/algorithm.h" diff --git a/tensorflow/compiler/jit/xla_compilation_cache.cc b/tensorflow/compiler/jit/xla_compilation_cache.cc index d7d5ee02265..435e3752b2e 100644 --- a/tensorflow/compiler/jit/xla_compilation_cache.cc +++ b/tensorflow/compiler/jit/xla_compilation_cache.cc @@ -283,25 +283,23 @@ Status XlaCompilationCache::CompileSingleOp( const NodeDef& node_def = ctx->op_kernel().def(); TF_ASSIGN_OR_RETURN(auto graph, CreateGraph(node_def, args, result_dtypes)); - bool has_tensor_list_arg = - absl::c_any_of(args, [](const XlaCompiler::Argument arg) { - return arg.kind == XlaCompiler::Argument::kTensorList; - }); const ConfigProto* config = ctx->function_library()->config_proto(); - bool use_mlir = config && config->experimental().enable_mlir_bridge(); + // TODO(b/171039585): Support tf.VarIsInitializedOp using MLIR. + bool use_mlir = config && config->experimental().enable_mlir_bridge() && + node_def.op() != "VarIsInitializedOp"; #ifdef LIBTPU_ON_GCE - if (use_mlir && has_tensor_list_arg) { + if (use_mlir) { LOG(WARNING) << "MLIR is not supported in this environment."; } return compiler->CompileGraph(compile_options, node_def.name(), std::move(graph), args, result); #else - // TODO(b/155596779): Support TensorList args. 
- if (!use_mlir || !has_tensor_list_arg) { + if (!use_mlir) { return compiler->CompileGraph(compile_options, node_def.name(), std::move(graph), args, result); } + VLOG(1) << "Using MLIR bridge"; GraphDebugInfo debug_info; std::vector control_rets; if (result_dtypes.empty()) { diff --git a/tensorflow/compiler/jit/xla_compilation_cache_test.cc b/tensorflow/compiler/jit/xla_compilation_cache_test.cc index 5578925b790..e40d6221324 100644 --- a/tensorflow/compiler/jit/xla_compilation_cache_test.cc +++ b/tensorflow/compiler/jit/xla_compilation_cache_test.cc @@ -78,7 +78,9 @@ TEST(XlaCompilationCacheTest, TestDisabledXlaCompilation) { absl::StrContains(status.error_message(), "XLA compilation disabled")); } -static void BM_BuildSignature(int iters, int n_args) { +void BM_BuildSignature(::testing::benchmark::State& state) { + const int n_args = state.range(0); + NameAttrList fn; fn.set_name("afunction"); for (int i = 0; i < n_args; i++) { @@ -93,7 +95,7 @@ static void BM_BuildSignature(int iters, int n_args) { args[i].constant_value = Tensor(DT_INT32, {4, 0}); } - while (--iters > 0) { + for (auto i : state) { xla::StatusOr s = XlaCompilationCache::BuildSignature(fn, args); CHECK(s.ok()); diff --git a/tensorflow/compiler/jit/xla_kernel_creator.cc b/tensorflow/compiler/jit/xla_kernel_creator.cc index d4a69da4898..b90f8b7b990 100644 --- a/tensorflow/compiler/jit/xla_kernel_creator.cc +++ b/tensorflow/compiler/jit/xla_kernel_creator.cc @@ -89,7 +89,8 @@ static Status CreateXlaKernel(FunctionLibraryRuntime* flr, XlaOpRegistry::RegisterCompilationKernels(); // Only check for compilability if the MLIR bridge is not enabled. - if (!GetMlirCommonFlags()->tf_mlir_enable_mlir_bridge) { + if (GetMlirCommonFlags()->tf_mlir_enable_mlir_bridge != + ConfigProto::Experimental::MLIR_BRIDGE_ROLLOUT_ENABLED) { RecursiveCompilabilityChecker::UncompilableNodesMap uncompilable_nodes_map; if (!IsCompilable(flr, node_def, &uncompilable_nodes_map)) { std::vector diff --git a/tensorflow/compiler/jit/xla_launch_util.cc b/tensorflow/compiler/jit/xla_launch_util.cc index a0e60b1eafe..1c5581eb4ab 100644 --- a/tensorflow/compiler/jit/xla_launch_util.cc +++ b/tensorflow/compiler/jit/xla_launch_util.cc @@ -426,7 +426,7 @@ Status XlaComputationLaunchContext::PopulateOutputs( ShapedBuffer buffer( xla::ShapeUtil::MakeTupleShape({nontuple_buffer.on_host_shape()}), xla::ShapeUtil::MakeTupleShape({nontuple_buffer.on_device_shape()}), - output.platform(), output.device_ordinal()); + output.device_ordinal()); buffer.buffers().CopySubtreeFrom(nontuple_buffer.buffers(), /*source_base_index=*/{}, /*target_base_index=*/{0}); @@ -583,7 +583,11 @@ XlaComputationLaunchContext::BuildXlaCompilerArguments( XlaCompiler::Argument& arg = out[input_num]; if (absl::c_binary_search(must_be_constant_idxs, input_num)) { // Handles compile-time constants. - TF_RET_CHECK(input->dtype() != DT_RESOURCE); + + // TODO(b/157241314): Support constants located in resource variables. 
+ TF_RET_CHECK(input->dtype() != DT_RESOURCE) + << "tf2xla bridge does not support must-be-constants located in " + "resource variables; try moving them to a tensor"; arg.kind = XlaCompiler::Argument::kConstant; arg.type = input->dtype(); arg.shape = input->shape(); diff --git a/tensorflow/compiler/mlir/hlo/BUILD b/tensorflow/compiler/mlir/hlo/BUILD index 30e8d8a86a7..e1b81133724 100644 --- a/tensorflow/compiler/mlir/hlo/BUILD +++ b/tensorflow/compiler/mlir/hlo/BUILD @@ -45,6 +45,7 @@ filegroup( "include/mlir-hlo/Dialect/mhlo/IR/chlo_ops.td", "include/mlir-hlo/Dialect/mhlo/IR/hlo_ops.td", "include/mlir-hlo/Dialect/mhlo/IR/hlo_ops_base.td", + "include/mlir-hlo/Dialect/mhlo/IR/hlo_ops_base_structs.td", "include/mlir-hlo/Dialect/mhlo/IR/hlo_utils.td", "include/mlir-hlo/Dialect/mhlo/IR/infer_fusibility_op_interface.td", "include/mlir-hlo/Dialect/mhlo/IR/lhlo_ops.td", @@ -122,8 +123,6 @@ gentbl( tbl_outs = [ ("-gen-op-decls", "include/mlir-hlo/Dialect/mhlo/IR/hlo_ops.h.inc"), ("-gen-op-defs", "include/mlir-hlo/Dialect/mhlo/IR/hlo_ops.cc.inc"), - ("-gen-struct-attr-decls", "include/mlir-hlo/Dialect/mhlo/IR/hlo_ops_structs.h.inc"), - ("-gen-struct-attr-defs", "include/mlir-hlo/Dialect/mhlo/IR/hlo_ops_structs.cc.inc"), ], tblgen = "@llvm-project//mlir:mlir-tblgen", td_file = "include/mlir-hlo/Dialect/mhlo/IR/hlo_ops.td", @@ -150,6 +149,24 @@ gentbl( ], tblgen = "@llvm-project//mlir:mlir-tblgen", td_file = "include/mlir-hlo/Dialect/mhlo/IR/hlo_ops_base.td", + td_relative_includes = [ + "include", + ], + td_srcs = [":hlo_ops_td_files"], +) + +gentbl( + name = "hlo_ops_base_structs_inc_gen", + compatible_with = get_compatible_with_cloud(), + tbl_outs = [ + ("-gen-struct-attr-decls", "include/mlir-hlo/Dialect/mhlo/IR/hlo_ops_base_structs.h.inc"), + ("-gen-struct-attr-defs", "include/mlir-hlo/Dialect/mhlo/IR/hlo_ops_base_structs.cc.inc"), + ], + tblgen = "@llvm-project//mlir:mlir-tblgen", + td_file = "include/mlir-hlo/Dialect/mhlo/IR/hlo_ops_base.td", + td_relative_includes = [ + "include", + ], td_srcs = [":hlo_ops_td_files"], ) @@ -194,6 +211,63 @@ gentbl( td_srcs = [":hlo_ops_td_files"], ) +gentbl( + name = "lhlo_gpu_ops_structs_inc_gen", + compatible_with = get_compatible_with_cloud(), + strip_include_prefix = "include", + tbl_outs = [ + ("-gen-struct-attr-decls", "include/mlir-hlo/Dialect/mhlo/IR/lhlo_gpu_ops_structs.h.inc"), + ("-gen-struct-attr-defs", "include/mlir-hlo/Dialect/mhlo/IR/lhlo_gpu_ops_structs.cc.inc"), + ], + tblgen = "@llvm-project//mlir:mlir-tblgen", + td_file = "include/mlir-hlo/Dialect/mhlo/IR/lhlo_gpu_ops_structs.td", + td_relative_includes = [ + "include", + ], + td_srcs = [ + ":hlo_ops_td_files", + "include/mlir-hlo/Dialect/mhlo/IR/lhlo_gpu_ops_base.td", + ], +) + +cc_library( + name = "lhlo_gpu_ops_structs", + srcs = [ + "include/mlir-hlo/Dialect/mhlo/IR/lhlo_gpu_ops_structs.cc.inc", + "include/mlir-hlo/Dialect/mhlo/IR/lhlo_gpu_ops_structs.h.inc", + "lib/Dialect/mhlo/IR/lhlo_gpu_ops_structs.cc", + ], + hdrs = [ + "include/mlir-hlo/Dialect/mhlo/IR/lhlo_gpu_ops_structs.h", + ], + includes = ["include"], + deps = [ + ":lhlo_gpu_ops_structs_inc_gen", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:Support", + ], +) + +gentbl( + name = "lhlo_gpu_ops_inc_gen", + compatible_with = get_compatible_with_cloud(), + strip_include_prefix = "include", + tbl_outs = [ + ("-gen-op-decls", "include/mlir-hlo/Dialect/mhlo/IR/lhlo_gpu_ops.h.inc"), + ("-gen-op-defs", "include/mlir-hlo/Dialect/mhlo/IR/lhlo_gpu_ops.cc.inc"), + ], + tblgen = "@llvm-project//mlir:mlir-tblgen", + td_file = 
"include/mlir-hlo/Dialect/mhlo/IR/lhlo_gpu_ops.td", + td_relative_includes = [ + "include", + ], + td_srcs = [ + ":hlo_ops_td_files", + "include/mlir-hlo/Dialect/mhlo/IR/lhlo_gpu_ops_base.td", + "include/mlir-hlo/Dialect/mhlo/IR/lhlo_gpu_ops_structs.td", + ], +) + #TODO(aminim): revisit the naming and grouping of these rules post-move. gentbl( name = "canonicalize_inc_gen", @@ -251,6 +325,23 @@ cc_library( alwayslink = 1, ) +cc_library( + name = "hlo_ops_base_structs", + srcs = [ + "include/mlir-hlo/Dialect/mhlo/IR/hlo_ops_base_structs.h.inc", + "lib/Dialect/mhlo/IR/hlo_ops_base_structs.cc", + ], + hdrs = [ + "include/mlir-hlo/Dialect/mhlo/IR/hlo_ops_base_structs.h", + ], + includes = ["include"], + deps = [ + ":hlo_ops_base_structs_inc_gen", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:Support", + ], +) + cc_library( name = "convert_op_folder", srcs = ["lib/utils/convert_op_folder.cc"], @@ -284,6 +375,7 @@ cc_library( ":chlo_ops_inc_gen", ":convert_op_folder", ":hlo_ops_base_inc_gen", + ":hlo_ops_base_structs", ":hlo_ops_inc_gen", ":infer_fusibility_op_interface", "@llvm-project//llvm:Support", @@ -314,6 +406,7 @@ cc_library( includes = ["include"], deps = [ ":hlo_ops_base_inc_gen", + ":hlo_ops_base_structs", ":lhlo_ops_inc_gen", "@llvm-project//llvm:Support", "@llvm-project//mlir:Analysis", @@ -330,6 +423,39 @@ cc_library( alwayslink = 1, ) +cc_library( + name = "lhlo_gpu", + srcs = [ + "include/mlir-hlo/Dialect/mhlo/IR/lhlo_gpu_ops.cc.inc", + "include/mlir-hlo/Dialect/mhlo/IR/lhlo_gpu_ops.h.inc", + "lib/Dialect/mhlo/IR/lhlo_gpu_ops.cc", + ], + hdrs = [ + "include/mlir-hlo/Dialect/mhlo/IR/lhlo_gpu_ops.h", + ], + includes = ["include"], + deps = [ + ":hlo", + ":hlo_ops_base_structs", + ":infer_fusibility_op_interface", + ":lhlo_gpu_ops_inc_gen", + ":lhlo_gpu_ops_structs", + "@llvm-project//llvm:Support", + "@llvm-project//mlir:Analysis", + "@llvm-project//mlir:CopyOpInterface", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:InferTypeOpInterface", + "@llvm-project//mlir:Pass", + "@llvm-project//mlir:SideEffects", + "@llvm-project//mlir:StandardOps", + "@llvm-project//mlir:Support", + "@llvm-project//mlir:TransformUtils", + "@llvm-project//mlir:Transforms", + "@llvm-project//mlir:ViewLikeInterface", + ], + alwayslink = 1, +) + cc_library( name = "hlo_dialect_registration", srcs = ["lib/Dialect/mhlo/IR/init.cc"], @@ -337,6 +463,7 @@ cc_library( deps = [ ":hlo", ":lhlo", + ":lhlo_gpu", "@llvm-project//mlir:IR", ], ) @@ -385,10 +512,20 @@ cc_library( ":lhlo", ":map_hlo_to_lhlo_op", "@llvm-project//llvm:Support", + "@llvm-project//mlir:IR", "@llvm-project//mlir:StandardOps", ], ) +cc_library( + name = "map_chlo_to_hlo_op", + hdrs = ["include/mlir-hlo/Dialect/mhlo/transforms/map_chlo_to_hlo_op.h"], + deps = [ + ":hlo", + "@llvm-project//mlir:IR", + ], +) + cc_library( name = "map_hlo_to_lhlo_op", hdrs = ["include/mlir-hlo/Dialect/mhlo/transforms/map_hlo_to_lhlo_op.h"], @@ -410,6 +547,7 @@ cc_library( "@llvm-project//mlir:IR", "@llvm-project//mlir:Pass", "@llvm-project//mlir:StandardOps", + "@llvm-project//mlir:TransformUtils", ], alwayslink = 1, ) @@ -477,9 +615,11 @@ cc_library( ], deps = [ ":hlo", + ":map_chlo_to_hlo_op", "@llvm-project//llvm:Support", "@llvm-project//mlir:IR", "@llvm-project//mlir:Pass", + "@llvm-project//mlir:SCFDialect", "@llvm-project//mlir:Shape", "@llvm-project//mlir:StandardOps", "@llvm-project//mlir:Transforms", @@ -522,6 +662,7 @@ cc_library( "@llvm-project//mlir:StandardOps", "@llvm-project//mlir:Support", "@llvm-project//mlir:TransformUtils", + 
"@llvm-project//mlir:ViewLikeInterface", ], alwayslink = 1, ) @@ -635,6 +776,7 @@ cc_library( "@llvm-project//mlir:Pass", "@llvm-project//mlir:StandardOps", "@llvm-project//mlir:Support", + "@llvm-project//mlir:TransformUtils", ], alwayslink = 1, ) @@ -762,6 +904,7 @@ cc_library( deps = [ ":chlo_legalize_to_hlo_inc_gen", ":hlo", + ":map_chlo_to_hlo_op", "@llvm-project//mlir:IR", "@llvm-project//mlir:SCFDialect", "@llvm-project//mlir:Shape", @@ -878,6 +1021,7 @@ cc_binary( ":all_passes", ":hlo", ":lhlo", + ":lhlo_gpu", "@llvm-project//llvm:Support", "@llvm-project//mlir:AllPassesAndDialectsNoRegistration", "@llvm-project//mlir:IR", diff --git a/tensorflow/compiler/mlir/hlo/README.md b/tensorflow/compiler/mlir/hlo/README.md index 61517cd9fca..05aabe3f67e 100644 --- a/tensorflow/compiler/mlir/hlo/README.md +++ b/tensorflow/compiler/mlir/hlo/README.md @@ -22,7 +22,7 @@ upstream. ## QuickStart: building and testing -These instructions work on Linux, you may have to adjust for your plaform. +These instructions work on Linux, you may have to adjust for your platform. To build the code in this repository, you need a clone of the LLVM/MLIR git repository: diff --git a/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/CMakeLists.txt b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/CMakeLists.txt index 09bdca84cd3..3fa2b908d9c 100644 --- a/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/CMakeLists.txt +++ b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/CMakeLists.txt @@ -25,7 +25,22 @@ function(add_mlir_hlo_dialect dialect dialect_namespace) endfunction() add_mlir_hlo_dialect(chlo_ops chlo) -add_mlir_hlo_dialect(hlo_ops mhlo) add_mlir_hlo_dialect(lhlo_ops lmhlo) +set(LLVM_TARGET_DEFINITIONS hlo_ops.td) +mlir_tablegen(hlo_ops.h.inc -gen-op-decls) +mlir_tablegen(hlo_ops.cc.inc -gen-op-defs) +mlir_tablegen(hlo_ops_base_structs.h.inc -gen-struct-attr-decls) +mlir_tablegen(hlo_ops_base_structs.cc.inc -gen-struct-attr-defs) +add_public_tablegen_target(MLIRhlo_opsIncGen) + +set(LLVM_TARGET_DEFINITIONS lhlo_gpu_ops.td) +mlir_tablegen(lhlo_gpu_ops.h.inc -gen-op-decls) +mlir_tablegen(lhlo_gpu_ops.cc.inc -gen-op-defs) +set(LLVM_TARGET_DEFINITIONS lhlo_gpu_ops_structs.td) +mlir_tablegen(lhlo_gpu_ops_structs.h.inc -gen-struct-attr-decls) +mlir_tablegen(lhlo_gpu_ops_structs.cc.inc -gen-struct-attr-defs) +add_public_tablegen_target(MLIRlhlo_gpu_opsIncGen) +add_dependencies(mlir-headers MLIRlhlo_gpu_opsIncGen) + add_mlir_interface(infer_fusibility_op_interface) diff --git a/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/chlo_ops.td b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/chlo_ops.td index 13d5f02368b..a65d8258a51 100644 --- a/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/chlo_ops.td +++ b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/chlo_ops.td @@ -89,10 +89,9 @@ class HLOClient_BroadcastBinaryElementwiseOp< OptionalAttr:$broadcast_dimensions ); - let builders = [OpBuilder< - "OpBuilder &builder, OperationState &result, Value left, Value right, " - "DenseIntElementsAttr broadcast_dimensions" - >]; + let builders = [ + OpBuilderDAG<(ins "Value":$left, "Value":$right, + "DenseIntElementsAttr":$broadcast_dimensions)>]; let results = (outs HLO_Tensor); @@ -427,7 +426,10 @@ def HLOClient_BroadcastCompareOp : HLOClient_BroadcastBinaryElementwiseOp< string summary = "Compare operator (with optional broadcasting)"; string description = [{ - Compares `lhs` and `rhs` elementwise according to 
`comparison_direction`. + Compares `lhs` and `rhs` elementwise according to `comparison_direction` + and `compare_type`. If unspecified, `compare_type` is FLOAT for float element + types, SIGNED for signed element types and UNSIGNED for unsigned element + types. See https://www.tensorflow.org/xla/operation_semantics#element-wise_comparison_operations. @@ -437,14 +439,15 @@ def HLOClient_BroadcastCompareOp : HLOClient_BroadcastBinaryElementwiseOp< HLO_Tensor:$lhs, HLO_Tensor:$rhs, OptionalAttr:$broadcast_dimensions, - HLO_ComparisonDirectionAttr:$comparison_direction + HLO_ComparisonDirectionAttr:$comparison_direction, + OptionalAttr:$compare_type ); let results = (outs HLO_PredTensor); - let builders = [OpBuilder< - "OpBuilder &builder, OperationState &result, Value lhs, Value rhs, " - "DenseIntElementsAttr broadcast_dimensions, StringAttr comparison_direction" - >]; + let builders = [ + OpBuilderDAG<(ins "Value":$lhs, "Value":$rhs, + "DenseIntElementsAttr":$broadcast_dimensions, + "StringAttr":$comparison_direction, CArg<"StringAttr", "{}">:$compare_type)>]; } #endif // CHLO_OPS diff --git a/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/hlo_ops.h b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/hlo_ops.h index 60ee4e613eb..b354189c12a 100644 --- a/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/hlo_ops.h +++ b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/hlo_ops.h @@ -19,6 +19,7 @@ limitations under the License. #define TENSORFLOW_COMPILER_MLIR_HLO_INCLUDE_MLIR_HLO_DIALECT_MHLO_IR_HLO_OPS_H_ #include "llvm/ADT/StringRef.h" +#include "mlir/Dialect/StandardOps/IR/Ops.h" #include "mlir/IR/Attributes.h" #include "mlir/IR/Dialect.h" #include "mlir/IR/DialectImplementation.h" @@ -32,7 +33,7 @@ limitations under the License. #include "mlir/Interfaces/SideEffectInterfaces.h" // clang-format off -#include "mlir-hlo/Dialect/mhlo/IR/hlo_ops_structs.h.inc" +#include "mlir-hlo/Dialect/mhlo/IR/hlo_ops_base_structs.h" #include "mlir-hlo/Dialect/mhlo/IR/infer_fusibility_op_interface.h" // clang-format on diff --git a/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/hlo_ops.td b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/hlo_ops.td index 507f7c11d63..42db595634c 100644 --- a/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/hlo_ops.td +++ b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/hlo_ops.td @@ -25,11 +25,6 @@ include "mlir-hlo/Dialect/mhlo/IR/hlo_ops_base.td" include "mlir-hlo/Dialect/mhlo/IR/hlo_utils.td" include "mlir-hlo/Dialect/mhlo/IR/infer_fusibility_op_interface.td" -def HLO_Dialect : Dialect { - let name = "mhlo"; - let cppNamespace = "::mlir::mhlo"; -} - class HLO_Op traits> : Op { // Whether this operation has a custom conversion to HLO or not. 
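// As a sketch of what the optional compare_type attribute introduced in this
// patch could look like in textual IR (generic-form syntax; operand types and
// attribute values invented for illustration):
//   %0 = "chlo.broadcast_compare"(%lhs, %rhs)
//        {comparison_direction = "LT", compare_type = "SIGNED"}
//        : (tensor<4xi32>, tensor<4xi32>) -> tensor<4xi1>
// When compare_type is omitted, the defaults described above apply (FLOAT for
// float element types, SIGNED/UNSIGNED for signed/unsigned integers).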
@@ -63,9 +58,8 @@ def HLO_ConstOp : HLO_Op<"constant", HLO_StaticShapeTensor:$output ); - let builders = [OpBuilder< - "OpBuilder &builder, OperationState &result, Attribute value" - >]; + let builders = [ + OpBuilderDAG<(ins "Attribute":$value)>]; let assemblyFormat = "attr-dict $value"; @@ -136,8 +130,8 @@ class HLO_UnaryElementwiseOp traits, } LogicalResult reifyReturnTypeShapes( OpBuilder& builder, SmallVectorImpl& reifiedReturnShapes) { - return deriveShapeFromFirstOperand(&builder, getOperation(), - &reifiedReturnShapes); + return ::mlir::mhlo::deriveShapeFromFirstOperand(&builder, getOperation(), + &reifiedReturnShapes); } bool inferInputOutputShapeEquality(int input, int output) { return true; @@ -152,9 +146,8 @@ class HLO_UnaryElementwiseOp traits, def HLO_AbsOp: HLO_UnaryElementwiseOp<"abs", [NoSideEffect, SameOperandsAndResultShape], TensorOf<[HLO_SInt, AnyFloat, HLO_Complex]>>, BASE_HLO_AbsOp { - let builders = [OpBuilder< - "OpBuilder &builder, OperationState &result, Value operand" - >]; + let builders = [ + OpBuilderDAG<(ins "Value":$operand)>]; } def HLO_CbrtOp: HLO_UnaryElementwiseOp<"cbrt", @@ -167,10 +160,8 @@ def HLO_ConvertOp : HLO_UnaryElementwiseOp< "convert", [NoSideEffect, SameOperandsAndResultShape], HLO_Tensor>, BASE_HLO_ConvertOp { - let builders = [OpBuilder< - "OpBuilder &, OperationState &tblgen_state, Value operand, " - "Type result_element_ty" - >]; + let builders = [ + OpBuilderDAG<(ins "Value":$operand, "Type":$result_element_ty)>]; let hasFolder = 1; @@ -247,7 +238,9 @@ def HLO_RealOp: HLO_UnaryElementwiseOp<"real", } def HLO_RoundOp: HLO_UnaryElementwiseOp<"round_nearest_afz", - [NoSideEffect, SameOperandsAndResultType], HLO_FpTensor>, BASE_HLO_RoundOp; + [NoSideEffect, SameOperandsAndResultType], HLO_FpTensor>, BASE_HLO_RoundOp { + let hasFolder = 1; +} def HLO_RsqrtOp: HLO_UnaryElementwiseOp<"rsqrt", [NoSideEffect, SameOperandsAndResultType], HLO_FpOrComplexTensor>, @@ -293,8 +286,8 @@ class HLO_BinaryElementwiseOp traits> : } LogicalResult reifyReturnTypeShapes( OpBuilder& builder, SmallVectorImpl& reifiedReturnShapes) { - return deriveShapeFromFirstOperand(&builder, getOperation(), - &reifiedReturnShapes); + return ::mlir::mhlo::deriveShapeFromFirstOperand(&builder, getOperation(), + &reifiedReturnShapes); } bool inferInputsShapeEquality(int lhs, int rhs) { return true; @@ -458,7 +451,7 @@ def HLO_SendOp : HLO_Op<"send", []> { let arguments = (ins HLO_TensorOrTuple:$operand, HLO_Token:$token, - ChannelHandle:$channel_id, + ChannelHandle:$channel_id, DefaultValuedAttr:$is_host_transfer ); @@ -483,7 +476,7 @@ def HLO_RecvOp : HLO_Op<"recv", []> { let arguments = (ins HLO_Token:$token, - ChannelHandle:$channel_id, + ChannelHandle:$channel_id, DefaultValuedAttr:$is_host_transfer ); @@ -587,7 +580,7 @@ def HLO_AllReduceOp : HLO_Op<"all_reduce", let arguments = (ins HLO_Tensor:$operand, I64ElementsAttr:$replica_groups, - OptionalAttr>:$channel_id + OptionalAttr:$channel_id ); let regions = (region SizedRegion<1>:$computation); let results = (outs HLO_Tensor); @@ -622,10 +615,9 @@ def HLO_ReduceOp: HLO_Op<"reduce", [ let results = (outs Variadic); - let builders = [OpBuilder< - "OpBuilder &, OperationState &state, ValueRange operands, " - "ValueRange init_values, DenseIntElementsAttr dimensions" - >]; + let builders = [ + OpBuilderDAG<(ins "ValueRange":$operands, "ValueRange":$init_values, + "DenseIntElementsAttr":$dimensions)>]; let extraClassDeclaration = [{ bool isFusibleWithConsumer() { @@ -661,18 +653,16 @@ def HLO_GetTupleElementOp: 
HLO_Op<"get_tuple_element", [NoSideEffect]>, BASE_HLO let hasFolder = 1; - let builders = [OpBuilder< - "OpBuilder &builder, OperationState &results, " - "Value value, int32_t index">]; + let builders = [ + OpBuilderDAG<(ins "Value":$value, "int32_t":$index)>]; } def HLO_TupleOp : HLO_Op<"tuple", [NoSideEffect]>, BASE_HLO_TupleOp { let arguments = (ins Variadic:$val); let results = (outs HLO_Tuple); - let builders = [OpBuilder< - "OpBuilder &builder, OperationState &results, " - "ValueRange values">]; + let builders = [ + OpBuilderDAG<(ins "ValueRange":$values)>]; let hasCanonicalizer = 1; } @@ -684,16 +674,19 @@ def HLO_CompareOp: HLO_Op<"compare", [NoSideEffect, SameTypeOperands, let arguments = (ins HLO_Tensor:$lhs, HLO_Tensor:$rhs, - HLO_ComparisonDirectionAttr:$comparison_direction + HLO_ComparisonDirectionAttr:$comparison_direction, + OptionalAttr:$compare_type ); let results = (outs HLO_PredTensor); let hasFolder = 1; - let builders = [OpBuilder< - "OpBuilder &builder, OperationState &result, Value lhs, Value rhs, " - "StringAttr comparison_direction" - >]; + let builders = [ + OpBuilderDAG<(ins "Value":$lhs, "Value":$rhs, + "StringAttr":$comparison_direction, CArg<"StringAttr", "{}">:$compare_type)>, + ]; + + let hasCustomHLOConverter = 1; } //===----------------------------------------------------------------------===// @@ -703,7 +696,8 @@ def HLO_CompareOp: HLO_Op<"compare", [NoSideEffect, SameTypeOperands, def HLO_SliceOp: HLO_Op< "slice", [NoSideEffect, SameOperandsAndResultElementType, - AllTypesMatch<["start_indices", "limit_indices", "strides"]>]> { + AllTypesMatch<["start_indices", "limit_indices", "strides"]>, + DeclareOpInterfaceMethods]> { let arguments = (ins HLO_Tensor:$operand, I64ElementsAttr:$start_indices, @@ -715,21 +709,6 @@ def HLO_SliceOp: HLO_Op< let hasCanonicalizer = 1; let hasFolder = 1; - - let builders = [OpBuilder< - "OpBuilder &builder, OperationState &result, Value operand, " - "DenseIntElementsAttr start_indices, DenseIntElementsAttr limit_indices, " - "DenseIntElementsAttr strides" - >]; - - let extraClassDeclaration = [{ - // Infers output type for given operand and attributes. Result type is - // unranked if any of the attributes is illegal. 
- static Type InferOutputTypes(Builder *builder, Value operand, - DenseIntElementsAttr start_indices, - DenseIntElementsAttr limit_indices, - DenseIntElementsAttr strides); - }]; } def HLO_DynamicSliceOp: HLO_Op<"dynamic-slice", @@ -959,15 +938,6 @@ def HLO_DotOp: HLO_Op<"dot", [NoSideEffect]>, BASE_HLO_DotOp { let results = (outs HLO_Tensor); } -def DotDimensionNumbers : StructAttr<"DotDimensionNumbers", HLO_Dialect, [ - StructFieldAttr<"lhs_batching_dimensions", I64ElementsAttr>, - StructFieldAttr<"rhs_batching_dimensions", I64ElementsAttr>, - StructFieldAttr<"lhs_contracting_dimensions", I64ElementsAttr>, - StructFieldAttr<"rhs_contracting_dimensions", I64ElementsAttr> - ]> { - let description = "Structure of dimension information for dot product"; -} - def HLO_DotGeneralOp: HLO_Op<"dot_general", [NoSideEffect]>, BASE_HLO_DotGeneralOp { let arguments = (ins HLO_Tensor:$lhs, @@ -1029,14 +999,6 @@ def HLO_FftOp: HLO_Op<"fft", [NoSideEffect]>, BASE_HLO_FftOp { let results = (outs HLO_Tensor); } -def GatherDimensionNumbers : StructAttr<"GatherDimensionNumbers", HLO_Dialect, - [StructFieldAttr<"offset_dims", I64ElementsAttr>, - StructFieldAttr<"collapsed_slice_dims", I64ElementsAttr>, - StructFieldAttr<"start_index_map", I64ElementsAttr>, - StructFieldAttr<"index_vector_dim", I64Attr>]> { - let description = "Structure of dimension information for gather"; -} - def HLO_GatherOp: HLO_Op<"gather", [NoSideEffect]>, BASE_HLO_GatherOp { let arguments = (ins HLO_Tensor:$operand, @@ -1114,7 +1076,7 @@ def HLO_ScatterOp: HLO_Op<"scatter", [RecursiveSideEffects]>, HLO_Tensor:$operand, HLO_Tensor:$scatter_indices, HLO_Tensor:$updates, - ScatterDimensionNumbers:$scatter_dimension_numbers, + ScatterDimensionNumbers:$scatter_dimension_numbers, DefaultValuedAttr:$indices_are_sorted, DefaultValuedAttr:$unique_indices ); @@ -1124,6 +1086,8 @@ def HLO_ScatterOp: HLO_Op<"scatter", [RecursiveSideEffects]>, let results = (outs HLO_Tensor); let hasCustomHLOConverter = 1; + + let hasFolder = 1; } // TODO(jpienaar): Add broadcastable trait. @@ -1181,10 +1145,9 @@ def HLO_SortOp : HLO_Op<"sort", [RecursiveSideEffects, SameOperandsAndResultShap let regions = (region SizedRegion<1>:$comparator); - let builders = [OpBuilder< - "OpBuilder &builder, OperationState &state, ValueRange operands, " - "int64_t dimension = -1, bool is_stable = false" - >]; + let builders = [ + OpBuilderDAG<(ins "ValueRange":$operands, CArg<"int64_t", "-1">:$dimension, + CArg<"bool", "false">:$is_stable)>]; // TODO(b/129422361): SortOp has special conversion logic to HLO. let hasCustomHLOConverter = 1; @@ -1220,6 +1183,8 @@ def HLO_PadOp: HLO_Op<"pad", // TODO(b/129422361): PadOp has a custom constructor for HLO. let hasCustomHLOConverter = 1; + + let hasFolder = 1; } def HLO_TraceOp: HLO_Op<"trace", []>, BASE_HLO_TraceOp { diff --git a/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/hlo_ops_base.td b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/hlo_ops_base.td index cba2dc370f0..572a2f9dc07 100644 --- a/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/hlo_ops_base.td +++ b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/hlo_ops_base.td @@ -18,6 +18,13 @@ limitations under the License. 
include "mlir/IR/OpBase.td" +def HLO_Dialect : Dialect { + let name = "mhlo"; + let cppNamespace = "::mlir::mhlo"; +} + +include "mlir-hlo/Dialect/mhlo/IR/hlo_ops_base_structs.td" + def HLO_Pred : TypeAlias; // TODO(hinsu): Use signed integers instead of signless integer which is being @@ -614,15 +621,6 @@ class BASE_HLO_CaseOp { // XLA parallelism related op definitions. //===----------------------------------------------------------------------===// -// Represents a unique identifier for each Send/Recv instruction pair or -// optionally for collective instructions (AllReduce, CollectivePermute, -// AllToAll). Non-positive channel_id handle is equivalent to no channel id. -class ChannelHandle : StructAttr<"ChannelHandle", dialect, [ - StructFieldAttr<"handle", I64Attr>, - StructFieldAttr<"type", I64Attr>]> { - let description = "two 64-bit integers 'handle' and 'type'"; -} - class BASE_HLO_ReplicaIdOp { string summary = "ReplicaId operator"; @@ -712,6 +710,7 @@ def HLO_PrecisionConfigAttr: OptionalAttr< TypedArrayAttrBase>; + //===----------------------------------------------------------------------===// // Fast Fourier Transform Type enum definitions. //===----------------------------------------------------------------------===// @@ -750,11 +749,30 @@ def HLO_ComparisonDirectionAttr : StrEnumAttr<"ComparisonDirection", HLO_COMPARISON_DIRECTION_LT ]>; +def HLO_DEFAULT_COMPARISON_TYPE : NativeCodeCall<"StringAttr()">; +def HLO_COMPARISON_TYPE_FLOAT : StrEnumAttrCase<"FLOAT">; +def HLO_COMPARISON_TYPE_FLOAT_TOTAL_ORDER : StrEnumAttrCase<"TOTALORDER">; +def HLO_COMPARISON_TYPE_SIGNED : StrEnumAttrCase<"SIGNED">; +def HLO_COMPARISON_TYPE_UNSIGNED : StrEnumAttrCase<"UNSIGNED">; + +def HLO_ComparisonTypeAttr : StrEnumAttr<"ComparisonType", + "Which comparison type to use.", + [ + HLO_COMPARISON_TYPE_FLOAT, + HLO_COMPARISON_TYPE_FLOAT_TOTAL_ORDER, + HLO_COMPARISON_TYPE_SIGNED, + HLO_COMPARISON_TYPE_UNSIGNED + ]>; + + class BASE_HLO_CompareOp { string summary = "Comparison operator"; string description = [{ - Compares `lhs` and `rhs` elementwise according to `comparison_direction`. + Compares `lhs` and `rhs` elementwise according to `comparison_direction` + and `compare_type`. If unspecified, `compare_type` is FLOAT for float element + types, SIGNED for signed element types and UNSIGNED for unsigned element + types. See https://www.tensorflow.org/xla/operation_semantics#element-wise_comparison_operations. @@ -1011,21 +1029,6 @@ class BASE_HLO_ConcatenateOp { // Common convolution attributes //===----------------------------------------------------------------------===// -class ConvDimensionNumbersBase - : StructAttr<"ConvDimensionNumbers", dialect, [ - StructFieldAttr<"input_batch_dimension",I64Attr>, - StructFieldAttr<"input_feature_dimension", I64Attr>, - StructFieldAttr<"input_spatial_dimensions", I64ElementsAttr>, - StructFieldAttr<"kernel_input_feature_dimension", I64Attr>, - StructFieldAttr<"kernel_output_feature_dimension", I64Attr>, - StructFieldAttr<"kernel_spatial_dimensions", I64ElementsAttr>, - StructFieldAttr<"output_batch_dimension", I64Attr>, - StructFieldAttr<"output_feature_dimension", I64Attr>, - StructFieldAttr<"output_spatial_dimensions", I64ElementsAttr>] > { - - let description = "Structure of dimension information for conv op"; -} - class ConvolutionAttributes { dag attributes = (ins // Default value: one for each of the spatial dimension. 
@@ -1036,7 +1039,7 @@ class ConvolutionAttributes { OptionalAttr<I64ElementsAttr>:$lhs_dilation, // Default value: one for each of the spatial dimension. OptionalAttr<I64ElementsAttr>:$rhs_dilation, - ConvDimensionNumbersBase<dialect>:$dimension_numbers, + ConvDimensionNumbers:$dimension_numbers, I64Attr:$feature_group_count, I64Attr:$batch_group_count, HLO_PrecisionConfigAttr:$precision_config @@ -1164,15 +1167,6 @@ class BASE_HLO_ReshapeOp { }]; } -class ScatterDimensionNumbers<Dialect dialect> : StructAttr< - "ScatterDimensionNumbers", dialect, [ - StructFieldAttr<"update_window_dims", I64ElementsAttr>, - StructFieldAttr<"inserted_window_dims", I64ElementsAttr>, - StructFieldAttr<"scatter_dims_to_operand_dims", I64ElementsAttr>, - StructFieldAttr<"index_vector_dim", I64Attr>]> { - let description = "Structure of dimension information for scatter"; -} - class BASE_HLO_ScatterOp { string summary = "Scatter operator"; @@ -1388,7 +1382,7 @@ class BASE_HLO_BitcastOp { string description = [{ This op changes the shape of the input in the way that the physical - arranggment of elements are unchanged. + arrangement of elements is unchanged. However, the op needs layout information to make sense of "physical arrangement of elements". Layout support in MHLO is currently under diff --git a/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/hlo_ops_base_structs.h b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/hlo_ops_base_structs.h new file mode 100644 index 00000000000..3b78ff8a367 --- /dev/null +++ b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/hlo_ops_base_structs.h @@ -0,0 +1,30 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// This file defines structures used in MHLO and LMHLO. + +#ifndef TENSORFLOW_COMPILER_MLIR_HLO_INCLUDE_MLIR_HLO_DIALECT_MHLO_IR_HLO_OPS_BASE_STRUCTS_H_ +#define TENSORFLOW_COMPILER_MLIR_HLO_INCLUDE_MLIR_HLO_DIALECT_MHLO_IR_HLO_OPS_BASE_STRUCTS_H_ + +#include "mlir/IR/Attributes.h" +#include "mlir/IR/Identifier.h" +#include "mlir/IR/StandardTypes.h" +#include "mlir/IR/Types.h" + +// Order matters, this .inc header is not self-contained, and relies on the +// #includes above. +#include "mlir-hlo/Dialect/mhlo/IR/hlo_ops_base_structs.h.inc" + +#endif  // TENSORFLOW_COMPILER_MLIR_HLO_INCLUDE_MLIR_HLO_DIALECT_MHLO_IR_HLO_OPS_BASE_STRUCTS_H_ diff --git a/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/hlo_ops_base_structs.td b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/hlo_ops_base_structs.td new file mode 100644 index 00000000000..d25eb5104c6 --- /dev/null +++ b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/hlo_ops_base_structs.td @@ -0,0 +1,73 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License.
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef HLO_OPS_BASE_STRUCTS +#define HLO_OPS_BASE_STRUCTS + +//===----------------------------------------------------------------------===// +// Dot dimensions enum definitions. +//===----------------------------------------------------------------------===// + +def DotDimensionNumbers : StructAttr<"DotDimensionNumbers", HLO_Dialect, [ + StructFieldAttr<"lhs_batching_dimensions", I64ElementsAttr>, + StructFieldAttr<"rhs_batching_dimensions", I64ElementsAttr>, + StructFieldAttr<"lhs_contracting_dimensions", I64ElementsAttr>, + StructFieldAttr<"rhs_contracting_dimensions", I64ElementsAttr> + ]> { + let description = "Structure of dimension information for dot product"; +} + +def ScatterDimensionNumbers : StructAttr< + "ScatterDimensionNumbers", HLO_Dialect, [ + StructFieldAttr<"update_window_dims", I64ElementsAttr>, + StructFieldAttr<"inserted_window_dims", I64ElementsAttr>, + StructFieldAttr<"scatter_dims_to_operand_dims", I64ElementsAttr>, + StructFieldAttr<"index_vector_dim", I64Attr>]> { + let description = "Structure of dimension information for scatter"; +} + +def ConvDimensionNumbers : StructAttr<"ConvDimensionNumbers", HLO_Dialect, [ + StructFieldAttr<"input_batch_dimension",I64Attr>, + StructFieldAttr<"input_feature_dimension", I64Attr>, + StructFieldAttr<"input_spatial_dimensions", I64ElementsAttr>, + StructFieldAttr<"kernel_input_feature_dimension", I64Attr>, + StructFieldAttr<"kernel_output_feature_dimension", I64Attr>, + StructFieldAttr<"kernel_spatial_dimensions", I64ElementsAttr>, + StructFieldAttr<"output_batch_dimension", I64Attr>, + StructFieldAttr<"output_feature_dimension", I64Attr>, + StructFieldAttr<"output_spatial_dimensions", I64ElementsAttr>] > { + + let description = "Structure of dimension information for conv op"; +} + +def GatherDimensionNumbers : StructAttr<"GatherDimensionNumbers", HLO_Dialect, + [StructFieldAttr<"offset_dims", I64ElementsAttr>, + StructFieldAttr<"collapsed_slice_dims", I64ElementsAttr>, + StructFieldAttr<"start_index_map", I64ElementsAttr>, + StructFieldAttr<"index_vector_dim", I64Attr>]> { + let description = "Structure of dimension information for gather"; +} + + +// Represents a unique identifier for each Send/Recv instruction pair or +// optionally for collective instructions (AllReduce, CollectivePermute, +// AllToAll). Non-positive channel_id handle is equivalent to no channel id. +def ChannelHandle : StructAttr<"ChannelHandle", HLO_Dialect, [ + StructFieldAttr<"handle", I64Attr>, + StructFieldAttr<"type", I64Attr>]> { + let description = "two 64-bit integers 'handle' and 'type'"; +} + +#endif // HLO_OPS_BASE_STRUCTS diff --git a/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/lhlo_gpu_ops.h b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/lhlo_gpu_ops.h new file mode 100644 index 00000000000..effa9ecc83b --- /dev/null +++ b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/lhlo_gpu_ops.h @@ -0,0 +1,59 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. 
diff --git a/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/lhlo_gpu_ops.h b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/lhlo_gpu_ops.h
new file mode 100644
index 00000000000..effa9ecc83b
--- /dev/null
+++ b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/lhlo_gpu_ops.h
@@ -0,0 +1,59 @@
+/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// This file defines the operations used in the LMHLO GPU dialect.
+
+#ifndef TENSORFLOW_COMPILER_MLIR_HLO_INCLUDE_MLIR_HLO_DIALECT_MHLO_IR_LHLO_GPU_OPS_H_
+#define TENSORFLOW_COMPILER_MLIR_HLO_INCLUDE_MLIR_HLO_DIALECT_MHLO_IR_LHLO_GPU_OPS_H_
+
+#include "llvm/ADT/StringRef.h"
+#include "mlir-hlo/Dialect/mhlo/IR/hlo_ops.h"
+#include "mlir-hlo/Dialect/mhlo/IR/hlo_ops_base_structs.h"
+#include "mlir-hlo/Dialect/mhlo/IR/infer_fusibility_op_interface.h"
+#include "mlir-hlo/Dialect/mhlo/IR/lhlo_gpu_ops_structs.h"
+#include "mlir/IR/Attributes.h"
+#include "mlir/IR/Dialect.h"
+#include "mlir/IR/Location.h"
+#include "mlir/IR/MLIRContext.h"
+#include "mlir/IR/OpDefinition.h"
+#include "mlir/IR/Operation.h"
+#include "mlir/IR/StandardTypes.h"
+#include "mlir/IR/Types.h"
+#include "mlir/Interfaces/CopyOpInterface.h"
+#include "mlir/Interfaces/InferTypeOpInterface.h"
+#include "mlir/Interfaces/SideEffectInterfaces.h"
+#include "mlir/Interfaces/ViewLikeInterface.h"
+
+namespace mlir {
+class OpBuilder;
+} // namespace mlir
+
+
+namespace mlir {
+namespace lmhlo_gpu {
+
+class LmhloGpuDialect : public Dialect {
+ public:
+  explicit LmhloGpuDialect(MLIRContext *context);
+  static StringRef getDialectNamespace() { return "lmhlo_gpu"; }
+};
+
+} // namespace lmhlo_gpu
+} // end namespace mlir
+
+#define GET_OP_CLASSES
+#include "mlir-hlo/Dialect/mhlo/IR/lhlo_gpu_ops.h.inc"
+
+#endif  // TENSORFLOW_COMPILER_MLIR_HLO_INCLUDE_MLIR_HLO_DIALECT_MHLO_IR_LHLO_GPU_OPS_H_
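For context, a dialect like LmhloGpuDialect above is only usable once it has been loaded into an MLIRContext. A minimal hedged sketch, using the MLIR API as it stood around the time of this patch (the helper is illustrative, not part of the diff):

    #include "mlir-hlo/Dialect/mhlo/IR/lhlo_gpu_ops.h"
    #include "mlir/IR/MLIRContext.h"

    // Makes lmhlo_gpu ops parseable and creatable in this context.
    void loadLmhloGpuDialect(mlir::MLIRContext &context) {
      context.getOrLoadDialect<mlir::lmhlo_gpu::LmhloGpuDialect>();
    }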
diff --git a/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/lhlo_gpu_ops.td b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/lhlo_gpu_ops.td
new file mode 100644
index 00000000000..b3708bf4ff1
--- /dev/null
+++ b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/lhlo_gpu_ops.td
@@ -0,0 +1,210 @@
+/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// This is the operation definition file for LMHLO level GPU operations.
+// Because these are LMHLO level operations, they operate on memrefs.
+
+#ifndef LHLO_GPU_OPS
+#define LHLO_GPU_OPS
+
+include "mlir/IR/OpBase.td"
+include "mlir/Interfaces/SideEffectInterfaces.td"
+include "mlir-hlo/Dialect/mhlo/IR/lhlo_ops_base.td"
+include "mlir-hlo/Dialect/mhlo/IR/lhlo_gpu_ops_base.td"
+include "mlir-hlo/Dialect/mhlo/IR/lhlo_gpu_ops_structs.td"
+
+
+class LHLOGPU_Op<string mnemonic, list<OpTrait> traits = []> :
+    Op<LHLO_GPU_Dialect, mnemonic,
+      !listconcat([MemRefsNormalizable], traits)>;
+
+// Type for scratch buffers used by GPU library calls (memref<?xi8>)
+def UntypedBuffer : MemRefRankOf<[I8], [1]>;
+
+// Cholesky info output buffer type.
+def I32Buffer : MemRefOf<[I32]>;
+
+//===----------------------------------------------------------------------===//
+// LMHLO ops representing batch norm library functions.
+//===----------------------------------------------------------------------===//
+
+// Note: these are semantically different from similar LHLO as the GPU library
+// calls generate or consume standard deviation, whereas LHLO ops generate or
+// consume variance (= std-dev ^ 2).
+
+def LHLOGPU_BatchNormGradOp : LHLOGPU_Op<"batch_norm_grad">,
+    BASE_HLO_BatchNormGradOp {
+  let arguments = (ins
+    Arg<LHLO_Buffer, "", [MemRead]>:$operand,
+    Arg<LHLO_Buffer, "", [MemRead]>:$scale,
+    Arg<LHLO_Buffer, "", [MemRead]>:$mean,
+    Arg<LHLO_Buffer, "", [MemRead]>:$stddev,
+    Arg<LHLO_Buffer, "", [MemRead]>:$grad_output,
+    Arg<LHLO_Buffer, "", [MemWrite]>:$grad_operand,  // gradient of $operand.
+    Arg<LHLO_Buffer, "", [MemWrite]>:$grad_scale,
+    Arg<LHLO_Buffer, "", [MemWrite]>:$grad_offset,
+    F32Attr:$epsilon,
+    I64Attr:$feature_index
+  );
+}
+
+def LHLOGPU_BatchNormInferenceOp : LHLOGPU_Op<"batch_norm_inference">,
+    BASE_HLO_BatchNormInferenceOp {
+  let arguments = (ins
+    Arg<LHLO_Buffer, "", [MemRead]>:$operand,
+    Arg<LHLO_Buffer, "", [MemRead]>:$scale,
+    Arg<LHLO_Buffer, "", [MemRead]>:$offset,
+    Arg<LHLO_Buffer, "", [MemRead]>:$mean,
+    Arg<LHLO_Buffer, "", [MemRead]>:$stddev,
+    Arg<LHLO_Buffer, "", [MemWrite]>:$output,
+    F32Attr:$epsilon,
+    I64Attr:$feature_index);
+}
+
+def LHLOGPU_BatchNormTrainingOp : LHLOGPU_Op<"batch_norm_training">,
+    BASE_HLO_BatchNormTrainingOp {
+
+  let arguments = (ins
+    Arg<LHLO_Buffer, "", [MemRead]>:$operand,
+    Arg<LHLO_Buffer, "", [MemRead]>:$scale,
+    Arg<LHLO_Buffer, "", [MemRead]>:$offset,
+    Arg<LHLO_Buffer, "", [MemWrite]>:$output,
+    Arg<LHLO_Buffer, "", [MemWrite]>:$batch_mean,
+    Arg<LHLO_Buffer, "", [MemWrite]>:$batch_stddev,
+    F32Attr:$epsilon,
+    I64Attr:$feature_index
+  );
+}
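Since the std-dev note above is the one semantic (not merely naming) difference from the LHLO batch norm ops, it is worth spelling out: the GPU library traffics in standard deviation while LHLO traffics in variance, related by variance = stddev^2. A trivial illustrative helper (plain C++, no MLIR; where epsilon gets folded in is library-specific and deliberately left out):

    #include <cmath>

    // Conversions a lowering from LHLO batch norm (variance) to these
    // GPU ops (std-dev) would need.
    double stddevFromVariance(double variance) { return std::sqrt(variance); }
    double varianceFromStddev(double stddev) { return stddev * stddev; }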
+
+//===----------------------------------------------------------------------===//
+// LMHLO ops representing convolution library functions.
+//===----------------------------------------------------------------------===//
+
+def ActivationModeNone : StrEnumAttrCase<"None">;
+def ActivationModeSigmoid : StrEnumAttrCase<"Sigmoid">;
+def ActivationModeTanh : StrEnumAttrCase<"Tanh">;
+def ActivationModeRelu : StrEnumAttrCase<"Relu">;
+def ActivationModeRelu6 : StrEnumAttrCase<"Relu6">;
+def ActivationModeReluX : StrEnumAttrCase<"ReluX">;
+def ActivationModeBandPass : StrEnumAttrCase<"BandPass">;
+
+def ActivationAttr : StrEnumAttr<"Activation",
+    "Activation applied with fused convolution",
+    [ActivationModeNone, ActivationModeSigmoid, ActivationModeTanh,
+     ActivationModeRelu, ActivationModeRelu6, ActivationModeReluX,
+     ActivationModeBandPass]>;
+
+def GpuConvolutionAttributes {
+  dag attributes = !con(
+      ConvolutionAttributes.attributes,
+      (ins F64Attr:$result_scale),
+      (ins ConvolutionBackendConfigAttr:$backend_config));
+}
+
+def GpuFusedConvolutionAttributes {
+  dag attributes = !con(
+      ConvolutionAttributes.attributes,
+      (ins F64Attr:$result_scale,
+           ActivationAttr:$activation_mode,
+           F64Attr:$side_input_scale),
+      (ins ConvolutionBackendConfigAttr:$backend_config));
+}
+
+def LHLOGPU_ConvForwardOp : LHLOGPU_Op<"conv_forward"> {
+  let arguments = !con(
+      (ins
+        Arg<LHLO_Buffer, "", [MemRead]>:$input,
+        Arg<LHLO_Buffer, "", [MemRead]>:$filter,
+        Arg<LHLO_Buffer, "", [MemWrite]>:$output,
+        Arg<UntypedBuffer, "", [MemWrite]>:$scratch),
+      GpuConvolutionAttributes.attributes);
+}
+
+def LHLOGPU_ConvBackwardInputOp : LHLOGPU_Op<"conv_backwardinput"> {
+  let arguments = !con(
+      (ins
+        Arg<LHLO_Buffer, "", [MemRead]>:$d_output,
+        Arg<LHLO_Buffer, "", [MemRead]>:$filter,
+        Arg<LHLO_Buffer, "", [MemWrite]>:$d_input,
+        Arg<UntypedBuffer, "", [MemWrite]>:$scratch),
+      GpuConvolutionAttributes.attributes);
+}
+
+def LHLOGPU_ConvBackwardFilterOp : LHLOGPU_Op<"conv_backwardfilter"> {
+  let arguments = !con(
+      (ins
+        Arg<LHLO_Buffer, "", [MemRead]>:$input,
+        Arg<LHLO_Buffer, "", [MemRead]>:$d_output,
+        Arg<LHLO_Buffer, "", [MemWrite]>:$d_filter,
+        Arg<UntypedBuffer, "", [MemWrite]>:$scratch),
+      GpuConvolutionAttributes.attributes);
+}
+
+// output = activation(result_scale * conv(input, filter) +
+//                     side_input * side_input_scale +
+//                     bias)
+def LHLOGPU_ConvForwardFusedOp : LHLOGPU_Op<"conv_forward_fused"> {
+  let arguments = !con(
+      (ins
+        Arg<LHLO_Buffer, "", [MemRead]>:$input,
+        Arg<LHLO_Buffer, "", [MemRead]>:$filter,
+        Arg<LHLO_Buffer, "", [MemRead]>:$bias,
+        Arg<LHLO_Buffer, "", [MemRead]>:$side_input,
+        Arg<LHLO_Buffer, "", [MemWrite]>:$output,
+        Arg<UntypedBuffer, "", [MemWrite]>:$scratch),
+      GpuFusedConvolutionAttributes.attributes);
+}
+
+//===----------------------------------------------------------------------===//
+// LMHLO ops representing other library functions.
+//===----------------------------------------------------------------------===//
+
+// output = alpha * (lhs * rhs)
+// Verify: beta = 0.0
+def LHLOGPU_GEMMOp : LHLOGPU_Op<"gemm"> {
+  let arguments = (ins
+    Arg<LHLO_Buffer, "", [MemRead]>:$lhs,
+    Arg<LHLO_Buffer, "", [MemRead]>:$rhs,
+    Arg<LHLO_Buffer, "", [MemWrite]>:$output,
+    DotDimensionNumbers:$dot_dimension_numbers,
+    F64Attr:$alpha,
+    I64Attr:$batch_size,
+    I64Attr:$algorithm);
+}
+
+// output = alpha(lhs * rhs) + beta * bias
+def LHLOGPU_GEMM_BiasOp : LHLOGPU_Op<"gemm_bias"> {
+  let arguments = (ins
+    Arg<LHLO_Buffer, "", [MemRead]>:$lhs,
+    Arg<LHLO_Buffer, "", [MemRead]>:$rhs,
+    Arg<LHLO_Buffer, "", [MemRead]>:$bias,
+    Arg<LHLO_Buffer, "", [MemWrite]>:$output,
+    DotDimensionNumbers:$dot_dimension_numbers,
+    F64Attr:$alpha,
+    F64Attr:$beta,
+    I64Attr:$batch_size,
+    I64Attr:$algorithm);
+}
+
+def LHLOGPU_CholeskyOp : LHLOGPU_Op<"cholesky"> {
+  let arguments = (ins
+    Arg<LHLO_Buffer, "", [MemRead]>:$input,
+    Arg<LHLO_Buffer, "", [MemWrite]>:$output,
+    Arg<UntypedBuffer, "", [MemWrite]>:$scratch,
+    Arg<I32Buffer, "", [MemWrite]>:$info,
+    BoolAttr:$is_upper);
+}
+
+#endif // LHLO_GPU_OPS
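To make the conv_forward_fused epilogue formula above concrete, here is a scalar sketch of the same computation with Relu as the activation (plain C++, illustrative only; the real op works on whole buffers via the GPU library):

    #include <algorithm>

    // Scalar form of: activation(result_scale * conv + side_input *
    // side_input_scale + bias), specialized to activation_mode = Relu.
    double fusedConvEpilogue(double conv, double side_input, double bias,
                             double result_scale, double side_input_scale) {
      return std::max(0.0, conv * result_scale +
                               side_input * side_input_scale + bias);
    }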
diff --git a/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/lhlo_gpu_ops_base.td b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/lhlo_gpu_ops_base.td
new file mode 100644
index 00000000000..820e4ce64b0
--- /dev/null
+++ b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/lhlo_gpu_ops_base.td
@@ -0,0 +1,28 @@
+/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// We define the dialect here so that both structs and ops can refer to it.
+
+#ifndef LHLO_GPU_OPS_BASE
+#define LHLO_GPU_OPS_BASE
+
+include "mlir/IR/OpBase.td"
+
+def LHLO_GPU_Dialect : Dialect {
+  let name = "lmhlo_gpu";
+  let cppNamespace = "::mlir::lmhlo_gpu";
+}
+
+#endif // LHLO_GPU_OPS_BASE
diff --git a/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/lhlo_gpu_ops_structs.h b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/lhlo_gpu_ops_structs.h
new file mode 100644
index 00000000000..ff642b82c22
--- /dev/null
+++ b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/lhlo_gpu_ops_structs.h
@@ -0,0 +1,30 @@
+/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ==============================================================================*/
+
+// This file defines structures used in the LMHLO_GPU dialect.
+
+#ifndef THIRD_PARTY_TENSORFLOW_COMPILER_MLIR_HLO_INCLUDE_MLIR_HLO_DIALECT_MHLO_IR_LHLO_GPU_OPS_STRUCTS_H_
+#define THIRD_PARTY_TENSORFLOW_COMPILER_MLIR_HLO_INCLUDE_MLIR_HLO_DIALECT_MHLO_IR_LHLO_GPU_OPS_STRUCTS_H_
+
+#include "mlir/IR/Attributes.h"
+#include "mlir/IR/Identifier.h"
+#include "mlir/IR/StandardTypes.h"
+#include "mlir/IR/Types.h"
+
+// Order matters, this .inc header is not self-contained, and relies on the
+// #includes above.
+#include "mlir-hlo/Dialect/mhlo/IR/lhlo_gpu_ops_structs.h.inc"
+
+#endif  // THIRD_PARTY_TENSORFLOW_COMPILER_MLIR_HLO_INCLUDE_MLIR_HLO_DIALECT_MHLO_IR_LHLO_GPU_OPS_STRUCTS_H_
diff --git a/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/lhlo_gpu_ops_structs.td b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/lhlo_gpu_ops_structs.td
new file mode 100644
index 00000000000..2236fc38e29
--- /dev/null
+++ b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/lhlo_gpu_ops_structs.td
@@ -0,0 +1,29 @@
+
+/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef LHLO_GPU_OPS_STRUCTS
+#define LHLO_GPU_OPS_STRUCTS
+
+include "mlir-hlo/Dialect/mhlo/IR/lhlo_gpu_ops_base.td"
+
+def ConvolutionBackendConfigAttr : StructAttr<"ConvolutionBackendConfig",
+    LHLO_GPU_Dialect, [
+    StructFieldAttr<"algorithm", I64Attr>,
+    StructFieldAttr<"tensor_ops_enabled", BoolAttr>]> {
+  let description = "GPU Convolution backend configuration";
+}
+
+#endif // LHLO_GPU_OPS_STRUCTS
diff --git a/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/lhlo_ops.h b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/lhlo_ops.h
index cc24e17c001..9dc6d7aa0c0 100644
--- a/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/lhlo_ops.h
+++ b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/lhlo_ops.h
@@ -13,12 +13,13 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-// This file defines the operations used in the LXLA dialect.
+// This file defines the operations used in the LHLO dialect.
 
 #ifndef TENSORFLOW_COMPILER_MLIR_HLO_INCLUDE_MLIR_HLO_DIALECT_MHLO_IR_LHLO_OPS_H_
 #define TENSORFLOW_COMPILER_MLIR_HLO_INCLUDE_MLIR_HLO_DIALECT_MHLO_IR_LHLO_OPS_H_
 
 #include "llvm/ADT/StringRef.h"
+#include "mlir-hlo/Dialect/mhlo/IR/hlo_ops_base_structs.h"
 #include "mlir/IR/Attributes.h"
 #include "mlir/IR/Dialect.h"
 #include "mlir/IR/Location.h"
@@ -33,11 +34,6 @@ limitations under the License.
 
 namespace mlir {
 class OpBuilder;
-} // namespace mlir
-
-#include "mlir-hlo/Dialect/mhlo/IR/lhlo_ops_structs.h.inc"
-
-namespace mlir {
 namespace lmhlo {
 
 class LmhloDialect : public Dialect {
diff --git a/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/lhlo_ops.td b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/lhlo_ops.td
index c013939c544..32901f47dbe 100644
--- a/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/lhlo_ops.td
+++ b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/lhlo_ops.td
@@ -284,7 +284,8 @@ def LHLO_CompareOp: LHLO_Op<"compare", []>, BASE_HLO_CompareOp {
     Arg<LHLO_Buffer, "", [MemRead]>:$rhs,
     Arg<LHLO_PredBuffer, "", [MemWrite]>:$out,
     OptionalAttr<BroadcastDimAttr>:$broadcast_dimensions,
-    HLO_ComparisonDirectionAttr:$comparison_direction
+    HLO_ComparisonDirectionAttr:$comparison_direction,
+    OptionalAttr<HLO_ComparisonTypeAttr>:$compare_type
   );
 }
@@ -340,7 +341,8 @@ def HLO_StaticMemRefCastOp: Op<LHLO_Dialect, "static_memref_cast",
     Arg<LHLO_Buffer, "", []>:$operand);
   let results = (outs Res<LHLO_Buffer, "", []>:$result);
 
-  let builders = [OpBuilder<"MemRefType resultType, Value operand",
+  let builders = [
+    OpBuilderDAG<(ins "MemRefType":$resultType, "Value":$operand),
     [{
       $_state.addOperands(operand);
       $_state.types.push_back(resultType);
@@ -386,8 +388,9 @@ def HLO_DynamicMemRefCastOp: Op<LHLO_Dialect, "dynamic_memref_cast",
     Res<LHLO_Buffer, "", []>:$result);
 
   let builders = [
-    OpBuilder<"MemRefType resultType, Value operand, ValueRange sizes, "
-              "ValueRange strides", [{
+    OpBuilderDAG<(ins "MemRefType":$resultType, "Value":$operand,
+                  "ValueRange":$sizes, "ValueRange":$strides),
+    [{
       $_state.addOperands(operand);
       $_state.addOperands(sizes);
      $_state.addOperands(strides);
@@ -592,6 +595,7 @@ def LHLO_DotOp: LHLO_Op<"dot", []>, BASE_HLO_DotOp {
   let arguments = (ins
     Arg<LHLO_Buffer, "", [MemRead]>:$lhs,
     Arg<LHLO_Buffer, "", [MemRead]>:$rhs,
+    DotDimensionNumbers:$dot_dimension_numbers,
     HLO_PrecisionConfigAttr:$precision_config,
     Arg<LHLO_Buffer, "", [MemWrite]>:$output
   );
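The DotDimensionNumbers argument added to lmhlo.dot above is built from C++ through the tblgen-generated StructAttr accessors. A hedged sketch, assuming the generated get() takes one DenseIntElementsAttr per I64ElementsAttr field plus the context (the helper and its matmul configuration are illustrative, not from this patch):

    #include "mlir-hlo/Dialect/mhlo/IR/hlo_ops.h"
    #include "mlir/IR/Builders.h"

    // Builds dot_dimension_numbers for a plain 2-D matmul: no batching
    // dimensions, lhs contracts dim 1 against rhs dim 0.
    mlir::mhlo::DotDimensionNumbers makeMatmulDnums(mlir::MLIRContext *ctx) {
      mlir::Builder b(ctx);
      auto dims = [&](llvm::ArrayRef<int64_t> vals) {
        return mlir::DenseIntElementsAttr::get(
            mlir::RankedTensorType::get(
                {static_cast<int64_t>(vals.size())}, b.getIntegerType(64)),
            vals);
      };
      return mlir::mhlo::DotDimensionNumbers::get(
          /*lhs_batching_dimensions=*/dims({}),
          /*rhs_batching_dimensions=*/dims({}),
          /*lhs_contracting_dimensions=*/dims({1}),
          /*rhs_contracting_dimensions=*/dims({0}), ctx);
    }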
@@ -601,11 +605,8 @@ def LHLO_GatherOp: LHLO_Op<"gather", []>, BASE_HLO_GatherOp {
   let arguments = (ins
     Arg<LHLO_Buffer, "", [MemRead]>:$operand,
     Arg<LHLO_Buffer, "", [MemRead]>:$start_indices,
-    I64Attr:$index_vector_dim,
-    I64ElementsAttr:$offset_dims,
+    GatherDimensionNumbers:$dimension_numbers,
     I64ElementsAttr:$slice_sizes,
-    I64ElementsAttr:$collapsed_slice_dims,
-    I64ElementsAttr:$start_index_map,
     Arg<LHLO_Buffer, "", [MemWrite]>:$output
   );
 }
@@ -623,7 +624,7 @@ def LHLO_ScatterOp: LHLO_Op<"scatter", []>, BASE_HLO_ScatterOp {
     Arg<LHLO_Buffer, "", [MemRead]>:$scatter_indices,
     Arg<LHLO_Buffer, "", [MemRead]>:$updates,
     Arg<LHLO_Buffer, "", [MemWrite]>:$output,
-    ScatterDimensionNumbers<LHLO_Dialect>:$scatter_dimension_numbers,
+    ScatterDimensionNumbers:$scatter_dimension_numbers,
     DefaultValuedAttr<BoolAttr, "false">:$indices_are_sorted,
     DefaultValuedAttr<BoolAttr, "false">:$unique_indices
   );
@@ -699,7 +700,7 @@ def LHLO_AllReduceOp : LHLO_Op<"all_reduce", [SameTypeOperands]>,
     Arg<LHLO_Buffer, "", [MemWrite]>:$output,
     I64ElementsAttr:$replica_groups,
     DefaultValuedAttr<BoolAttr, "false">:$constrain_layout,
-    OptionalAttr<ChannelHandle<LHLO_Dialect>>:$channel_id,
+    OptionalAttr<ChannelHandle>:$channel_id,
     DefaultValuedAttr<BoolAttr, "false">:$use_global_device_ids
   );
   let regions = (region SizedRegion<1>:$computation);
@@ -712,7 +713,7 @@ def LHLO_CollectivePermuteOp: LHLO_Op<"collective_permute", [SameTypeOperands]>,
     Arg<LHLO_Buffer, "", [MemRead]>:$operand,
     Arg<LHLO_Buffer, "", [MemWrite]>:$output,
     I64ElementsAttr:$source_target_pairs,
-    OptionalAttr<ChannelHandle<LHLO_Dialect>>:$channel_id
+    OptionalAttr<ChannelHandle>:$channel_id
   );
 }
@@ -814,7 +815,7 @@ def FusionOp : LHLO_Op<"fusion", [SingleBlockImplicitTerminator<"TerminatorOp">]
   let skipDefaultBuilders = 1;
   let builders = [
-     OpBuilder<"ArrayRef<NamedAttribute> attributes">
+     OpBuilderDAG<(ins "ArrayRef<NamedAttribute>":$attributes)>
   ];
 }
@@ -824,9 +825,9 @@ def TerminatorOp :
   let description = [{
     Terminator operation for the LHLO dialect.
   }];
-  let builders = [OpBuilder<"ValueRange operands",
-    [{ build($_builder, $_state, llvm::None, operands, llvm::None); }]
-  >];
+  let builders = [
+    OpBuilderDAG<(ins "ValueRange":$operands),
+    [{ build($_builder, $_state, llvm::None, operands, llvm::None); }]>];
 }
 
 #endif // LHLO_OPS
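The OpBuilderDAG migrations above are mechanical: each string-form OpBuilder becomes a DAG of typed parameters, and the generated C++ build() overload keeps the same argument list. A hedged usage sketch for the TerminatorOp builder, following the standard generated-builder convention (the helper itself is not code from this patch):

    #include "mlir-hlo/Dialect/mhlo/IR/lhlo_ops.h"
    #include "mlir/IR/Builders.h"

    // Creates an lmhlo terminator with the given operands; this routes
    // through the build() overload that
    // OpBuilderDAG<(ins "ValueRange":$operands)> declares.
    void emitLmhloTerminator(mlir::OpBuilder &b, mlir::Location loc,
                             mlir::ValueRange operands) {
      b.create<mlir::lmhlo::TerminatorOp>(loc, operands);
    }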
diff --git a/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/transforms/map_chlo_to_hlo_op.h b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/transforms/map_chlo_to_hlo_op.h
new file mode 100644
index 00000000000..316e65076ae
--- /dev/null
+++ b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/transforms/map_chlo_to_hlo_op.h
@@ -0,0 +1,97 @@
+/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_MLIR_HLO_INCLUDE_MLIR_HLO_DIALECT_MHLO_TRANSFORMS_MAP_CHLO_TO_MHLO_OP_H_
+#define TENSORFLOW_COMPILER_MLIR_HLO_INCLUDE_MLIR_HLO_DIALECT_MHLO_TRANSFORMS_MAP_CHLO_TO_MHLO_OP_H_
+
+#include <type_traits>
+
+#include "mlir-hlo/Dialect/mhlo/IR/chlo_ops.h"
+#include "mlir-hlo/Dialect/mhlo/IR/hlo_ops.h"
+#include "mlir/IR/PatternMatch.h"
+
+namespace mlir {
+namespace chlo {
+
+struct HloComplexAdaptor {
+  static mhlo::ComplexOp CreateOp(BroadcastComplexOp from_op, Type result_type,
+                                  Value broadcasted_lhs, Value broadcasted_rhs,
+                                  OpBuilder &builder) {
+    return builder.create<mhlo::ComplexOp>(from_op.getLoc(), result_type,
+                                           broadcasted_lhs, broadcasted_rhs);
+  }
+};
+template <typename FromOpTy, typename ToOpTy>
+struct HloBinaryElementwiseAdaptor {
+  static ToOpTy CreateOp(FromOpTy from_op, Type result_type,
+                         Value broadcasted_lhs, Value broadcasted_rhs,
+                         OpBuilder &builder) {
+    return builder.create<ToOpTy>(from_op.getLoc(), result_type,
+                                  broadcasted_lhs, broadcasted_rhs);
+  }
+};
+struct HloCompareAdaptor {
+  static mhlo::CompareOp CreateOp(BroadcastCompareOp from_op, Type result_type,
+                                  Value broadcasted_lhs, Value broadcasted_rhs,
+                                  OpBuilder &builder) {
+    return builder.create<mhlo::CompareOp>(
+        from_op.getLoc(), result_type, broadcasted_lhs, broadcasted_rhs,
+        from_op.comparison_direction(), from_op.compare_typeAttr());
+  }
+};
+
+// Populate a pattern for each Broadcasting CHlo op. This requires the pattern
+// to take a ChloOpTy, MhloOpTy, and an Adaptor as templated values.
+template
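To make the adaptor pattern concrete, here is a hedged sketch of how the truncated template above is presumably instantiated by the pattern-population machinery (not shown in this hunk): one adaptor per broadcasting CHLO op, mapping it to its MHLO counterpart once both operands have been broadcast. The op names exist in the chlo and mhlo dialects; the alias itself is illustrative only.

    // chlo.broadcast_add lowers to mhlo.add via the generic binary adaptor.
    using BroadcastAddAdaptor = mlir::chlo::HloBinaryElementwiseAdaptor<
        mlir::chlo::BroadcastAddOp, mlir::mhlo::AddOp>;
    // BroadcastAddAdaptor::CreateOp(from_op, result_type, lhs, rhs, builder)
    // then emits the mhlo.add at from_op's location.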